AlistairPullen
/

llama-3.1-8B-grpo

@@ -27,7 +27,7 @@ print(output["generated_text"])
 ## Training procedure
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/cosineai/deepseek-reproduce/runs/sxqwipa8)
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).

 ## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/cosineai/deepseek-reproduce/runs/mr9qnll8)
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).

all_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.0007357805297391096,
-    "train_runtime": 5420.3326,
     "train_samples": 6725,
-    "train_samples_per_second": 2.214,
-    "train_steps_per_second": 0.046
 }

 {
     "total_flos": 0.0,
+    "train_loss": 3.1328258249329856e-06,
+    "train_runtime": 38.497,
     "train_samples": 6725,
+    "train_samples_per_second": 311.713,
+    "train_steps_per_second": 6.494
 }

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.0007357805297391096,
-    "train_runtime": 5420.3326,
     "train_samples": 6725,
-    "train_samples_per_second": 2.214,
-    "train_steps_per_second": 0.046
 }

 {
     "total_flos": 0.0,
+    "train_loss": 3.1328258249329856e-06,
+    "train_runtime": 38.497,
     "train_samples": 6725,
+    "train_samples_per_second": 311.713,
+    "train_steps_per_second": 6.494
 }

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.29739776951672864,
   "eval_steps": 500,
-  "global_step": 250,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -4009,13 +4009,29 @@
       "step": 250
     },
     {
-      "epoch": 0.29739776951672864,
-      "step": 250,
       "total_flos": 0.0,
-      "train_loss": 0.0007357805297391096,
-      "train_runtime": 5420.3326,
-      "train_samples_per_second": 2.214,
-      "train_steps_per_second": 0.046
     }
   ],
   "logging_steps": 1,

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.2985873605947955,
   "eval_steps": 500,
+  "global_step": 251,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "step": 250
     },
     {
+      "completion_length": 171.3333396911621,
+      "epoch": 0.2985873605947955,
+      "grad_norm": 0.017227069093127906,
+      "kl": 0.01967620849609375,
+      "learning_rate": 2.43689976739403e-10,
+      "loss": 0.0008,
+      "reward": 0.5448333483655006,
+      "reward_std": 1.0098017808049917,
+      "rewards/correctness_reward_func": 0.4583333469927311,
+      "rewards/int_reward_func": 0.11458333674818277,
+      "rewards/soft_format_reward_func": 0.0,
+      "rewards/strict_format_reward_func": 0.0,
+      "rewards/xmlcount_reward_func": -0.028083334676921368,
+      "step": 251
+    },
+    {
+      "epoch": 0.2985873605947955,
+      "step": 251,
       "total_flos": 0.0,
+      "train_loss": 3.1328258249329856e-06,
+      "train_runtime": 38.497,
+      "train_samples_per_second": 311.713,
+      "train_steps_per_second": 6.494
     }
   ],
   "logging_steps": 1,