lole25 committed on
Commit
c5be93f
1 Parent(s): 5cce308

Model save

Browse files
README.md CHANGED
@@ -2,13 +2,9 @@
2
  license: mit
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
- - generated_from_trainer
7
  - trl
8
  - dpo
9
  - generated_from_trainer
10
- datasets:
11
- - HuggingFaceH4/ultrafeedback_binarized
12
  base_model: microsoft/phi-2
13
  model-index:
14
  - name: phi-2-dpo-ultrachat-lora
@@ -20,17 +16,17 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  # phi-2-dpo-ultrachat-lora
22
 
23
- This model is a fine-tuned version of [lole25/phi-2-sft-ultrachat-lora](https://huggingface.co/lole25/phi-2-sft-ultrachat-lora) on the HuggingFaceH4/ultrafeedback_binarized dataset.
24
  It achieves the following results on the evaluation set:
25
- - Loss: 0.6912
26
- - Rewards/chosen: -0.0072
27
- - Rewards/rejected: -0.0111
28
- - Rewards/accuracies: 0.3180
29
- - Rewards/margins: 0.0040
30
- - Logps/rejected: -95.3090
31
- - Logps/chosen: -92.4438
32
- - Logits/rejected: 0.8021
33
- - Logits/chosen: 0.7828
34
 
35
  ## Model description
36
 
@@ -61,16 +57,21 @@ The following hyperparameters were used during training:
61
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
62
  - lr_scheduler_type: cosine
63
  - lr_scheduler_warmup_ratio: 0.1
64
- - num_epochs: 1
65
 
66
  ### Training results
67
 
68
- | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
69
- |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
70
- | 0.693 | 0.21 | 100 | 0.6931 | -0.0005 | -0.0008 | 0.2680 | 0.0004 | -94.2804 | -91.7748 | 0.8176 | 0.7998 |
71
- | 0.6922 | 0.42 | 200 | 0.6924 | -0.0018 | -0.0032 | 0.3020 | 0.0014 | -94.5141 | -91.9068 | 0.8121 | 0.7941 |
72
- | 0.6917 | 0.63 | 300 | 0.6917 | -0.0049 | -0.0077 | 0.3100 | 0.0028 | -94.9659 | -92.2189 | 0.8057 | 0.7870 |
73
- | 0.6905 | 0.84 | 400 | 0.6913 | -0.0070 | -0.0105 | 0.3280 | 0.0036 | -95.2509 | -92.4247 | 0.8012 | 0.7827 |
 
 
 
 
 
74
 
75
 
76
  ### Framework versions
 
2
  license: mit
3
  library_name: peft
4
  tags:
 
 
5
  - trl
6
  - dpo
7
  - generated_from_trainer
 
 
8
  base_model: microsoft/phi-2
9
  model-index:
10
  - name: phi-2-dpo-ultrachat-lora
 
16
 
17
  # phi-2-dpo-ultrachat-lora
18
 
19
+ This model is a fine-tuned version of [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) on the HuggingFaceH4/ultrafeedback_binarized dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.6872
22
+ - Rewards/chosen: -0.0312
23
+ - Rewards/rejected: -0.0436
24
+ - Rewards/accuracies: 0.3340
25
+ - Rewards/margins: 0.0124
26
+ - Logps/rejected: -98.5542
27
+ - Logps/chosen: -94.8435
28
+ - Logits/rejected: 0.7532
29
+ - Logits/chosen: 0.7326
30
 
31
  ## Model description
32
 
 
57
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
58
  - lr_scheduler_type: cosine
59
  - lr_scheduler_warmup_ratio: 0.1
60
+ - num_epochs: 2
61
 
62
  ### Training results
63
 
64
+ | Training Loss | Epoch | Step | Logits/chosen | Logits/rejected | Logps/chosen | Logps/rejected | Validation Loss | Rewards/accuracies | Rewards/chosen | Rewards/margins | Rewards/rejected |
65
+ |:-------------:|:-----:|:----:|:-------------:|:---------------:|:------------:|:--------------:|:---------------:|:------------------:|:--------------:|:---------------:|:----------------:|
66
+ | 0.693 | 0.21 | 100 | 0.7998 | 0.8176 | -91.7748 | -94.2804 | 0.6931 | 0.2680 | -0.0005 | 0.0004 | -0.0008 |
67
+ | 0.6922 | 0.42 | 200 | 0.7941 | 0.8121 | -91.9068 | -94.5141 | 0.6924 | 0.3020 | -0.0018 | 0.0014 | -0.0032 |
68
+ | 0.6917 | 0.63 | 300 | 0.7870 | 0.8057 | -92.2189 | -94.9659 | 0.6917 | 0.3100 | -0.0049 | 0.0028 | -0.0077 |
69
+ | 0.6905 | 0.84 | 400 | 0.7827 | 0.8012 | -92.4247 | -95.2509 | 0.6913 | 0.3280 | -0.0070 | 0.0036 | -0.0105 |
70
+ | 0.6898 | 1.05 | 500 | 0.7711 | 0.7903 | -93.1429 | -96.2490 | 0.6900 | 0.3360 | -0.0142 | 0.0064 | -0.0205 |
71
+ | 0.6882 | 1.26 | 600 | 0.7527 | 0.7722 | -93.8981 | -97.2594 | 0.6887 | 0.3340 | -0.0217 | 0.0089 | -0.0306 |
72
+ | 0.6858 | 1.47 | 700 | 0.7395 | 0.7600 | -94.4717 | -98.0249 | 0.6879 | 0.3280 | -0.0274 | 0.0108 | -0.0383 |
73
+ | 0.6857 | 1.67 | 800 | 0.7341 | 0.7548 | -94.7618 | -98.4270 | 0.6874 | 0.3340 | -0.0303 | 0.0120 | -0.0423 |
74
+ | 0.6866 | 1.88 | 900 | 0.7321 | 0.7528 | -94.8550 | -98.5655 | 0.6872 | 0.3420 | -0.0313 | 0.0124 | -0.0437 |
75
 
76
 
77
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fa1266189ba9f031fe9920131fb977dc48c5ca5c6db3dad77ed60ef0e45d05b
3
  size 41977616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03a564973f08ad1b996adb14089e1d24c9d0ffd3e6da6652e793b9c19210b312
3
  size 41977616
all_results.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_logits/chosen": 0.7827913761138916,
4
- "eval_logits/rejected": 0.8020623326301575,
5
- "eval_logps/chosen": -92.44380187988281,
6
- "eval_logps/rejected": -95.30902099609375,
7
- "eval_loss": 0.6912217736244202,
8
- "eval_rewards/accuracies": 0.3179999887943268,
9
- "eval_rewards/chosen": -0.00716440798714757,
10
- "eval_rewards/margins": 0.00395576748996973,
11
- "eval_rewards/rejected": -0.011120175942778587,
12
- "eval_runtime": 273.2522,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 7.319,
15
- "eval_steps_per_second": 0.457,
16
- "train_loss": 0.6920521804121805,
17
- "train_runtime": 6813.2628,
18
  "train_samples": 30567,
19
- "train_samples_per_second": 4.486,
20
- "train_steps_per_second": 0.07
21
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "eval_logits/chosen": 0.7325530052185059,
4
+ "eval_logits/rejected": 0.7531598806381226,
5
+ "eval_logps/chosen": -94.8434829711914,
6
+ "eval_logps/rejected": -98.55415344238281,
7
+ "eval_loss": 0.687246561050415,
8
+ "eval_rewards/accuracies": 0.33399999141693115,
9
+ "eval_rewards/chosen": -0.03116113506257534,
10
+ "eval_rewards/margins": 0.012410260736942291,
11
+ "eval_rewards/rejected": -0.04357139766216278,
12
+ "eval_runtime": 272.5724,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 7.337,
15
+ "eval_steps_per_second": 0.459,
16
+ "train_loss": 0.3994182998029441,
17
+ "train_runtime": 8026.3391,
18
  "train_samples": 30567,
19
+ "train_samples_per_second": 7.617,
20
+ "train_steps_per_second": 0.119
21
  }
eval_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_logits/chosen": 0.7827913761138916,
4
- "eval_logits/rejected": 0.8020623326301575,
5
- "eval_logps/chosen": -92.44380187988281,
6
- "eval_logps/rejected": -95.30902099609375,
7
- "eval_loss": 0.6912217736244202,
8
- "eval_rewards/accuracies": 0.3179999887943268,
9
- "eval_rewards/chosen": -0.00716440798714757,
10
- "eval_rewards/margins": 0.00395576748996973,
11
- "eval_rewards/rejected": -0.011120175942778587,
12
- "eval_runtime": 273.2522,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 7.319,
15
- "eval_steps_per_second": 0.457
16
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "eval_logits/chosen": 0.7325530052185059,
4
+ "eval_logits/rejected": 0.7531598806381226,
5
+ "eval_logps/chosen": -94.8434829711914,
6
+ "eval_logps/rejected": -98.55415344238281,
7
+ "eval_loss": 0.687246561050415,
8
+ "eval_rewards/accuracies": 0.33399999141693115,
9
+ "eval_rewards/chosen": -0.03116113506257534,
10
+ "eval_rewards/margins": 0.012410260736942291,
11
+ "eval_rewards/rejected": -0.04357139766216278,
12
+ "eval_runtime": 272.5724,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 7.337,
15
+ "eval_steps_per_second": 0.459
16
  }
runs/Mar01_10-22-14_gpu4-119-4/events.out.tfevents.1709249008.gpu4-119-4.1841365.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f82f02aa55fcc970e5d579ac9bdb2e52d9bb9e91dd1ec8ccf21d6bee8280044e
3
- size 40230
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53fa2c7156918af2e86794d36bc57146c6256b04f9f11982581c5d9d612686d6
3
+ size 43754
runs/Mar01_10-22-14_gpu4-119-4/events.out.tfevents.1709257307.gpu4-119-4.1841365.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fe03d74aa04b2919e7ad4d520740505dc619f70188a5eca953e98fc156aaf23
3
+ size 828
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 0.6920521804121805,
4
- "train_runtime": 6813.2628,
5
  "train_samples": 30567,
6
- "train_samples_per_second": 4.486,
7
- "train_steps_per_second": 0.07
8
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "train_loss": 0.3994182998029441,
4
+ "train_runtime": 8026.3391,
5
  "train_samples": 30567,
6
+ "train_samples_per_second": 7.617,
7
+ "train_steps_per_second": 0.119
8
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9984301412872841,
5
  "eval_steps": 100,
6
- "global_step": 477,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -648,116 +648,868 @@
648
  },
649
  {
650
  "epoch": 0.86,
651
- "learning_rate": 2.9492720416985004e-07,
652
- "logits/chosen": 0.8186850547790527,
653
- "logits/rejected": 0.8151271939277649,
654
- "logps/chosen": -95.82102966308594,
655
- "logps/rejected": -80.586669921875,
656
  "loss": 0.6916,
657
- "rewards/accuracies": 0.3375000059604645,
658
- "rewards/chosen": -0.00794359389692545,
659
- "rewards/margins": 0.003056485904380679,
660
- "rewards/rejected": -0.011000080034136772,
661
  "step": 410
662
  },
663
  {
664
  "epoch": 0.88,
665
- "learning_rate": 2.1464952759020857e-07,
666
- "logits/chosen": 0.7571959495544434,
667
- "logits/rejected": 0.8163139224052429,
668
- "logps/chosen": -95.65677642822266,
669
- "logps/rejected": -97.86299896240234,
670
- "loss": 0.6915,
671
- "rewards/accuracies": 0.3125,
672
- "rewards/chosen": -0.00859010498970747,
673
- "rewards/margins": 0.0028384437318891287,
674
- "rewards/rejected": -0.011428548023104668,
675
  "step": 420
676
  },
677
  {
678
  "epoch": 0.9,
679
- "learning_rate": 1.4662207078575685e-07,
680
- "logits/chosen": 0.8645572662353516,
681
- "logits/rejected": 0.887597918510437,
682
- "logps/chosen": -83.41182708740234,
683
- "logps/rejected": -90.17640686035156,
684
- "loss": 0.6913,
685
- "rewards/accuracies": 0.39375001192092896,
686
- "rewards/chosen": -0.006618577986955643,
687
- "rewards/margins": 0.004637080244719982,
688
- "rewards/rejected": -0.011255658231675625,
689
  "step": 430
690
  },
691
  {
692
  "epoch": 0.92,
693
- "learning_rate": 9.120948298936422e-08,
694
- "logits/chosen": 0.7825466394424438,
695
- "logits/rejected": 0.8328782916069031,
696
- "logps/chosen": -109.79942321777344,
697
- "logps/rejected": -82.07637023925781,
698
- "loss": 0.6911,
699
- "rewards/accuracies": 0.34375,
700
- "rewards/chosen": -0.005336672533303499,
701
- "rewards/margins": 0.005987245589494705,
702
- "rewards/rejected": -0.011323917657136917,
703
  "step": 440
704
  },
705
  {
706
  "epoch": 0.94,
707
- "learning_rate": 4.870879364444109e-08,
708
- "logits/chosen": 0.774901270866394,
709
- "logits/rejected": 0.8499285578727722,
710
- "logps/chosen": -81.24304962158203,
711
- "logps/rejected": -71.88755798339844,
712
- "loss": 0.6911,
713
- "rewards/accuracies": 0.29374998807907104,
714
- "rewards/chosen": -0.005921828560531139,
715
- "rewards/margins": 0.0035197760444134474,
716
- "rewards/rejected": -0.00944160483777523,
717
  "step": 450
718
  },
719
  {
720
  "epoch": 0.96,
721
- "learning_rate": 1.93478202307823e-08,
722
- "logits/chosen": 0.8049672842025757,
723
- "logits/rejected": 0.8570533990859985,
724
- "logps/chosen": -105.7918930053711,
725
- "logps/rejected": -87.53938293457031,
726
- "loss": 0.6907,
727
- "rewards/accuracies": 0.3187499940395355,
728
- "rewards/chosen": -0.005085950251668692,
729
- "rewards/margins": 0.0068057505413889885,
730
- "rewards/rejected": -0.011891700327396393,
731
  "step": 460
732
  },
733
  {
734
  "epoch": 0.98,
735
- "learning_rate": 3.283947088983663e-09,
736
- "logits/chosen": 0.8561135530471802,
737
- "logits/rejected": 0.8520036935806274,
738
- "logps/chosen": -115.6650390625,
739
- "logps/rejected": -110.88753509521484,
740
- "loss": 0.6903,
741
- "rewards/accuracies": 0.39375001192092896,
742
- "rewards/chosen": -0.006030657794326544,
743
- "rewards/margins": 0.00688832625746727,
744
- "rewards/rejected": -0.012918984517455101,
745
  "step": 470
746
  },
747
  {
748
  "epoch": 1.0,
749
- "step": 477,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750
  "total_flos": 0.0,
751
- "train_loss": 0.6920521804121805,
752
- "train_runtime": 6813.2628,
753
- "train_samples_per_second": 4.486,
754
- "train_steps_per_second": 0.07
755
  }
756
  ],
757
  "logging_steps": 10,
758
- "max_steps": 477,
759
  "num_input_tokens_seen": 0,
760
- "num_train_epochs": 1,
761
  "save_steps": 100,
762
  "total_flos": 0.0,
763
  "train_batch_size": 4,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.9968602825745683,
5
  "eval_steps": 100,
6
+ "global_step": 954,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
648
  },
649
  {
650
  "epoch": 0.86,
651
+ "learning_rate": 3.5218566107988872e-06,
652
+ "logits/chosen": 0.8185564279556274,
653
+ "logits/rejected": 0.8151994943618774,
654
+ "logps/chosen": -95.8418960571289,
655
+ "logps/rejected": -80.64591979980469,
656
  "loss": 0.6916,
657
+ "rewards/accuracies": 0.33125001192092896,
658
+ "rewards/chosen": -0.008152286522090435,
659
+ "rewards/margins": 0.003440212458372116,
660
+ "rewards/rejected": -0.011592499911785126,
661
  "step": 410
662
  },
663
  {
664
  "epoch": 0.88,
665
+ "learning_rate": 3.437648009023905e-06,
666
+ "logits/chosen": 0.7549771070480347,
667
+ "logits/rejected": 0.8158624768257141,
668
+ "logps/chosen": -95.72089385986328,
669
+ "logps/rejected": -97.96694946289062,
670
+ "loss": 0.6914,
671
+ "rewards/accuracies": 0.34375,
672
+ "rewards/chosen": -0.00923125259578228,
673
+ "rewards/margins": 0.0032367813400924206,
674
+ "rewards/rejected": -0.012468034401535988,
675
  "step": 420
676
  },
677
  {
678
  "epoch": 0.9,
679
+ "learning_rate": 3.352182461642929e-06,
680
+ "logits/chosen": 0.862383246421814,
681
+ "logits/rejected": 0.8848444819450378,
682
+ "logps/chosen": -83.51128387451172,
683
+ "logps/rejected": -90.3624038696289,
684
+ "loss": 0.6909,
685
+ "rewards/accuracies": 0.41874998807907104,
686
+ "rewards/chosen": -0.0076131029054522514,
687
+ "rewards/margins": 0.005502562969923019,
688
+ "rewards/rejected": -0.013115664944052696,
689
  "step": 430
690
  },
691
  {
692
  "epoch": 0.92,
693
+ "learning_rate": 3.265574537815398e-06,
694
+ "logits/chosen": 0.7784366011619568,
695
+ "logits/rejected": 0.8296224474906921,
696
+ "logps/chosen": -109.9617919921875,
697
+ "logps/rejected": -82.34262084960938,
698
+ "loss": 0.6907,
699
+ "rewards/accuracies": 0.3499999940395355,
700
+ "rewards/chosen": -0.006960420869290829,
701
+ "rewards/margins": 0.007026113569736481,
702
+ "rewards/rejected": -0.013986535370349884,
703
  "step": 440
704
  },
705
  {
706
  "epoch": 0.94,
707
+ "learning_rate": 3.177940338091043e-06,
708
+ "logits/chosen": 0.769826352596283,
709
+ "logits/rejected": 0.8450337648391724,
710
+ "logps/chosen": -81.3624496459961,
711
+ "logps/rejected": -72.23616027832031,
712
+ "loss": 0.6903,
713
+ "rewards/accuracies": 0.30000001192092896,
714
+ "rewards/chosen": -0.00711588840931654,
715
+ "rewards/margins": 0.005811682902276516,
716
+ "rewards/rejected": -0.012927571311593056,
717
  "step": 450
718
  },
719
  {
720
  "epoch": 0.96,
721
+ "learning_rate": 3.089397338773569e-06,
722
+ "logits/chosen": 0.7976155877113342,
723
+ "logits/rejected": 0.8512013554573059,
724
+ "logps/chosen": -106.02879333496094,
725
+ "logps/rejected": -88.0615234375,
726
+ "loss": 0.6899,
727
+ "rewards/accuracies": 0.32499998807907104,
728
+ "rewards/chosen": -0.007455066777765751,
729
+ "rewards/margins": 0.00965816993266344,
730
+ "rewards/rejected": -0.017113234847784042,
731
  "step": 460
732
  },
733
  {
734
  "epoch": 0.98,
735
+ "learning_rate": 3.0000642344401115e-06,
736
+ "logits/chosen": 0.8498729467391968,
737
+ "logits/rejected": 0.8459364771842957,
738
+ "logps/chosen": -116.07499694824219,
739
+ "logps/rejected": -111.56224060058594,
740
+ "loss": 0.6892,
741
+ "rewards/accuracies": 0.375,
742
+ "rewards/chosen": -0.010130222886800766,
743
+ "rewards/margins": 0.009535903111100197,
744
+ "rewards/rejected": -0.019666125997900963,
745
  "step": 470
746
  },
747
  {
748
  "epoch": 1.0,
749
+ "learning_rate": 2.9100607788275547e-06,
750
+ "logits/chosen": 0.8643083572387695,
751
+ "logits/rejected": 0.8725606799125671,
752
+ "logps/chosen": -76.1094741821289,
753
+ "logps/rejected": -66.09565734863281,
754
+ "loss": 0.6903,
755
+ "rewards/accuracies": 0.26875001192092896,
756
+ "rewards/chosen": -0.010615186765789986,
757
+ "rewards/margins": 0.004579311236739159,
758
+ "rewards/rejected": -0.015194499865174294,
759
+ "step": 480
760
+ },
761
+ {
762
+ "epoch": 1.03,
763
+ "learning_rate": 2.8195076242990124e-06,
764
+ "logits/chosen": 0.8051859736442566,
765
+ "logits/rejected": 0.8318646550178528,
766
+ "logps/chosen": -93.38316345214844,
767
+ "logps/rejected": -100.01469421386719,
768
+ "loss": 0.6903,
769
+ "rewards/accuracies": 0.35624998807907104,
770
+ "rewards/chosen": -0.014997744932770729,
771
+ "rewards/margins": 0.004119081888347864,
772
+ "rewards/rejected": -0.019116824492812157,
773
+ "step": 490
774
+ },
775
+ {
776
+ "epoch": 1.05,
777
+ "learning_rate": 2.72852616010567e-06,
778
+ "logits/chosen": 0.8241451978683472,
779
+ "logits/rejected": 0.8089090585708618,
780
+ "logps/chosen": -102.48905181884766,
781
+ "logps/rejected": -102.73881530761719,
782
+ "loss": 0.6898,
783
+ "rewards/accuracies": 0.39375001192092896,
784
+ "rewards/chosen": -0.013014930300414562,
785
+ "rewards/margins": 0.011042198166251183,
786
+ "rewards/rejected": -0.02405713126063347,
787
+ "step": 500
788
+ },
789
+ {
790
+ "epoch": 1.05,
791
+ "eval_logits/chosen": 0.7711244225502014,
792
+ "eval_logits/rejected": 0.7902986407279968,
793
+ "eval_logps/chosen": -93.14293670654297,
794
+ "eval_logps/rejected": -96.24901580810547,
795
+ "eval_loss": 0.6899796724319458,
796
+ "eval_rewards/accuracies": 0.335999995470047,
797
+ "eval_rewards/chosen": -0.014155692420899868,
798
+ "eval_rewards/margins": 0.006364365108311176,
799
+ "eval_rewards/rejected": -0.020520057529211044,
800
+ "eval_runtime": 272.796,
801
+ "eval_samples_per_second": 7.331,
802
+ "eval_steps_per_second": 0.458,
803
+ "step": 500
804
+ },
805
+ {
806
+ "epoch": 1.07,
807
+ "learning_rate": 2.637238349660819e-06,
808
+ "logits/chosen": 0.8259177207946777,
809
+ "logits/rejected": 0.8255017995834351,
810
+ "logps/chosen": -92.70307159423828,
811
+ "logps/rejected": -90.75215148925781,
812
+ "loss": 0.6893,
813
+ "rewards/accuracies": 0.35624998807907104,
814
+ "rewards/chosen": -0.016096794977784157,
815
+ "rewards/margins": 0.006517867557704449,
816
+ "rewards/rejected": -0.02261466160416603,
817
+ "step": 510
818
+ },
819
+ {
820
+ "epoch": 1.09,
821
+ "learning_rate": 2.5457665670441937e-06,
822
+ "logits/chosen": 0.8241807222366333,
823
+ "logits/rejected": 0.8674284219741821,
824
+ "logps/chosen": -103.48587799072266,
825
+ "logps/rejected": -95.00794219970703,
826
+ "loss": 0.6887,
827
+ "rewards/accuracies": 0.375,
828
+ "rewards/chosen": -0.012511089444160461,
829
+ "rewards/margins": 0.011780844070017338,
830
+ "rewards/rejected": -0.024291934445500374,
831
+ "step": 520
832
+ },
833
+ {
834
+ "epoch": 1.11,
835
+ "learning_rate": 2.4542334329558075e-06,
836
+ "logits/chosen": 0.8090038299560547,
837
+ "logits/rejected": 0.8909789323806763,
838
+ "logps/chosen": -108.34500885009766,
839
+ "logps/rejected": -90.74520111083984,
840
+ "loss": 0.6897,
841
+ "rewards/accuracies": 0.38749998807907104,
842
+ "rewards/chosen": -0.013912905938923359,
843
+ "rewards/margins": 0.01008325070142746,
844
+ "rewards/rejected": -0.023996157571673393,
845
+ "step": 530
846
+ },
847
+ {
848
+ "epoch": 1.13,
849
+ "learning_rate": 2.3627616503391813e-06,
850
+ "logits/chosen": 0.7575622797012329,
851
+ "logits/rejected": 0.7894734144210815,
852
+ "logps/chosen": -103.4853515625,
853
+ "logps/rejected": -89.14204406738281,
854
+ "loss": 0.6886,
855
+ "rewards/accuracies": 0.36250001192092896,
856
+ "rewards/chosen": -0.01601695641875267,
857
+ "rewards/margins": 0.009688997641205788,
858
+ "rewards/rejected": -0.025705954059958458,
859
+ "step": 540
860
+ },
861
+ {
862
+ "epoch": 1.15,
863
+ "learning_rate": 2.271473839894331e-06,
864
+ "logits/chosen": 0.7494063377380371,
865
+ "logits/rejected": 0.7767230272293091,
866
+ "logps/chosen": -93.18793487548828,
867
+ "logps/rejected": -85.53865814208984,
868
+ "loss": 0.689,
869
+ "rewards/accuracies": 0.34375,
870
+ "rewards/chosen": -0.015046611428260803,
871
+ "rewards/margins": 0.009437174536287785,
872
+ "rewards/rejected": -0.024483786895871162,
873
+ "step": 550
874
+ },
875
+ {
876
+ "epoch": 1.17,
877
+ "learning_rate": 2.1804923757009885e-06,
878
+ "logits/chosen": 0.7773897051811218,
879
+ "logits/rejected": 0.8226049542427063,
880
+ "logps/chosen": -118.14359283447266,
881
+ "logps/rejected": -118.2892837524414,
882
+ "loss": 0.6883,
883
+ "rewards/accuracies": 0.39375001192092896,
884
+ "rewards/chosen": -0.01944814994931221,
885
+ "rewards/margins": 0.012502019293606281,
886
+ "rewards/rejected": -0.03195016831159592,
887
+ "step": 560
888
+ },
889
+ {
890
+ "epoch": 1.19,
891
+ "learning_rate": 2.089939221172446e-06,
892
+ "logits/chosen": 0.7639147639274597,
893
+ "logits/rejected": 0.8038470149040222,
894
+ "logps/chosen": -96.55006408691406,
895
+ "logps/rejected": -94.61703491210938,
896
+ "loss": 0.6893,
897
+ "rewards/accuracies": 0.36250001192092896,
898
+ "rewards/chosen": -0.01976608671247959,
899
+ "rewards/margins": 0.01228870265185833,
900
+ "rewards/rejected": -0.03205478936433792,
901
+ "step": 570
902
+ },
903
+ {
904
+ "epoch": 1.21,
905
+ "learning_rate": 1.9999357655598894e-06,
906
+ "logits/chosen": 0.799372673034668,
907
+ "logits/rejected": 0.7758508920669556,
908
+ "logps/chosen": -107.0300521850586,
909
+ "logps/rejected": -101.98541259765625,
910
+ "loss": 0.6898,
911
+ "rewards/accuracies": 0.32499998807907104,
912
+ "rewards/chosen": -0.023156706243753433,
913
+ "rewards/margins": 0.0033106685150414705,
914
+ "rewards/rejected": -0.026467373594641685,
915
+ "step": 580
916
+ },
917
+ {
918
+ "epoch": 1.23,
919
+ "learning_rate": 1.9106026612264316e-06,
920
+ "logits/chosen": 0.8342978358268738,
921
+ "logits/rejected": 0.818748950958252,
922
+ "logps/chosen": -82.849609375,
923
+ "logps/rejected": -82.92151641845703,
924
+ "loss": 0.6875,
925
+ "rewards/accuracies": 0.34375,
926
+ "rewards/chosen": -0.021987412124872208,
927
+ "rewards/margins": 0.00925590842962265,
928
+ "rewards/rejected": -0.03124331869184971,
929
+ "step": 590
930
+ },
931
+ {
932
+ "epoch": 1.26,
933
+ "learning_rate": 1.8220596619089576e-06,
934
+ "logits/chosen": 0.7792515158653259,
935
+ "logits/rejected": 0.7631284594535828,
936
+ "logps/chosen": -108.453369140625,
937
+ "logps/rejected": -109.84110260009766,
938
+ "loss": 0.6882,
939
+ "rewards/accuracies": 0.3812499940395355,
940
+ "rewards/chosen": -0.024591121822595596,
941
+ "rewards/margins": 0.010857349261641502,
942
+ "rewards/rejected": -0.03544846922159195,
943
+ "step": 600
944
+ },
945
+ {
946
+ "epoch": 1.26,
947
+ "eval_logits/chosen": 0.7526758909225464,
948
+ "eval_logits/rejected": 0.7722000479698181,
949
+ "eval_logps/chosen": -93.8980941772461,
950
+ "eval_logps/rejected": -97.25941467285156,
951
+ "eval_loss": 0.6886565089225769,
952
+ "eval_rewards/accuracies": 0.33399999141693115,
953
+ "eval_rewards/chosen": -0.02170729637145996,
954
+ "eval_rewards/margins": 0.008916829712688923,
955
+ "eval_rewards/rejected": -0.03062412701547146,
956
+ "eval_runtime": 272.575,
957
+ "eval_samples_per_second": 7.337,
958
+ "eval_steps_per_second": 0.459,
959
+ "step": 600
960
+ },
961
+ {
962
+ "epoch": 1.28,
963
+ "learning_rate": 1.7344254621846018e-06,
964
+ "logits/chosen": 0.7705439329147339,
965
+ "logits/rejected": 0.7898679375648499,
966
+ "logps/chosen": -114.97818756103516,
967
+ "logps/rejected": -105.07425689697266,
968
+ "loss": 0.6871,
969
+ "rewards/accuracies": 0.35624998807907104,
970
+ "rewards/chosen": -0.020355116575956345,
971
+ "rewards/margins": 0.01100640743970871,
972
+ "rewards/rejected": -0.031361524015665054,
973
+ "step": 610
974
+ },
975
+ {
976
+ "epoch": 1.3,
977
+ "learning_rate": 1.647817538357072e-06,
978
+ "logits/chosen": 0.8015000224113464,
979
+ "logits/rejected": 0.8592731356620789,
980
+ "logps/chosen": -105.63825988769531,
981
+ "logps/rejected": -101.30821228027344,
982
+ "loss": 0.6877,
983
+ "rewards/accuracies": 0.38749998807907104,
984
+ "rewards/chosen": -0.025122905150055885,
985
+ "rewards/margins": 0.012796345166862011,
986
+ "rewards/rejected": -0.03791924566030502,
987
+ "step": 620
988
+ },
989
+ {
990
+ "epoch": 1.32,
991
+ "learning_rate": 1.5623519909760953e-06,
992
+ "logits/chosen": 0.72571861743927,
993
+ "logits/rejected": 0.7672609090805054,
994
+ "logps/chosen": -95.39134216308594,
995
+ "logps/rejected": -91.43064880371094,
996
+ "loss": 0.6875,
997
+ "rewards/accuracies": 0.3125,
998
+ "rewards/chosen": -0.023522889241576195,
999
+ "rewards/margins": 0.008042030036449432,
1000
+ "rewards/rejected": -0.031564921140670776,
1001
+ "step": 630
1002
+ },
1003
+ {
1004
+ "epoch": 1.34,
1005
+ "learning_rate": 1.4781433892011132e-06,
1006
+ "logits/chosen": 0.7983066439628601,
1007
+ "logits/rejected": 0.828285813331604,
1008
+ "logps/chosen": -91.58604431152344,
1009
+ "logps/rejected": -98.34529876708984,
1010
+ "loss": 0.6877,
1011
+ "rewards/accuracies": 0.3375000059604645,
1012
+ "rewards/chosen": -0.02216607704758644,
1013
+ "rewards/margins": 0.009402362629771233,
1014
+ "rewards/rejected": -0.03156844526529312,
1015
+ "step": 640
1016
+ },
1017
+ {
1018
+ "epoch": 1.36,
1019
+ "learning_rate": 1.3953046172178413e-06,
1020
+ "logits/chosen": 0.8052563667297363,
1021
+ "logits/rejected": 0.861487090587616,
1022
+ "logps/chosen": -102.22875213623047,
1023
+ "logps/rejected": -115.90311431884766,
1024
+ "loss": 0.688,
1025
+ "rewards/accuracies": 0.39375001192092896,
1026
+ "rewards/chosen": -0.02779082953929901,
1027
+ "rewards/margins": 0.011276346631348133,
1028
+ "rewards/rejected": -0.03906718268990517,
1029
+ "step": 650
1030
+ },
1031
+ {
1032
+ "epoch": 1.38,
1033
+ "learning_rate": 1.3139467229135999e-06,
1034
+ "logits/chosen": 0.7473502159118652,
1035
+ "logits/rejected": 0.7619670629501343,
1036
+ "logps/chosen": -81.72496032714844,
1037
+ "logps/rejected": -88.32991027832031,
1038
+ "loss": 0.6867,
1039
+ "rewards/accuracies": 0.32499998807907104,
1040
+ "rewards/chosen": -0.023514125496149063,
1041
+ "rewards/margins": 0.01081976480782032,
1042
+ "rewards/rejected": -0.034333888441324234,
1043
+ "step": 660
1044
+ },
1045
+ {
1046
+ "epoch": 1.4,
1047
+ "learning_rate": 1.2341787690142436e-06,
1048
+ "logits/chosen": 0.677151083946228,
1049
+ "logits/rejected": 0.7038652300834656,
1050
+ "logps/chosen": -116.34004211425781,
1051
+ "logps/rejected": -98.69480895996094,
1052
+ "loss": 0.6883,
1053
+ "rewards/accuracies": 0.3687500059604645,
1054
+ "rewards/chosen": -0.024621013551950455,
1055
+ "rewards/margins": 0.016095632687211037,
1056
+ "rewards/rejected": -0.040716640651226044,
1057
+ "step": 670
1058
+ },
1059
+ {
1060
+ "epoch": 1.42,
1061
+ "learning_rate": 1.1561076868822756e-06,
1062
+ "logits/chosen": 0.7057438492774963,
1063
+ "logits/rejected": 0.734573483467102,
1064
+ "logps/chosen": -77.70423889160156,
1065
+ "logps/rejected": -83.44449615478516,
1066
+ "loss": 0.6875,
1067
+ "rewards/accuracies": 0.32499998807907104,
1068
+ "rewards/chosen": -0.029654154554009438,
1069
+ "rewards/margins": 0.00593325262889266,
1070
+ "rewards/rejected": -0.035587407648563385,
1071
+ "step": 680
1072
+ },
1073
+ {
1074
+ "epoch": 1.44,
1075
+ "learning_rate": 1.079838133172111e-06,
1076
+ "logits/chosen": 0.7637497186660767,
1077
+ "logits/rejected": 0.8190714716911316,
1078
+ "logps/chosen": -95.7881088256836,
1079
+ "logps/rejected": -82.04200744628906,
1080
+ "loss": 0.6882,
1081
+ "rewards/accuracies": 0.35624998807907104,
1082
+ "rewards/chosen": -0.026545196771621704,
1083
+ "rewards/margins": 0.014608954079449177,
1084
+ "rewards/rejected": -0.041154149919748306,
1085
+ "step": 690
1086
+ },
1087
+ {
1088
+ "epoch": 1.47,
1089
+ "learning_rate": 1.0054723495346484e-06,
1090
+ "logits/chosen": 0.8055804371833801,
1091
+ "logits/rejected": 0.8074037432670593,
1092
+ "logps/chosen": -106.52473449707031,
1093
+ "logps/rejected": -99.84661865234375,
1094
+ "loss": 0.6858,
1095
+ "rewards/accuracies": 0.375,
1096
+ "rewards/chosen": -0.03077777661383152,
1097
+ "rewards/margins": 0.016430744901299477,
1098
+ "rewards/rejected": -0.0472085177898407,
1099
+ "step": 700
1100
+ },
1101
+ {
1102
+ "epoch": 1.47,
1103
+ "eval_logits/chosen": 0.7395281791687012,
1104
+ "eval_logits/rejected": 0.7599726319313049,
1105
+ "eval_logps/chosen": -94.4716567993164,
1106
+ "eval_logps/rejected": -98.02493286132812,
1107
+ "eval_loss": 0.6878523826599121,
1108
+ "eval_rewards/accuracies": 0.328000009059906,
1109
+ "eval_rewards/chosen": -0.02744293212890625,
1110
+ "eval_rewards/margins": 0.010836233384907246,
1111
+ "eval_rewards/rejected": -0.03827916085720062,
1112
+ "eval_runtime": 272.6941,
1113
+ "eval_samples_per_second": 7.334,
1114
+ "eval_steps_per_second": 0.458,
1115
+ "step": 700
1116
+ },
1117
+ {
1118
+ "epoch": 1.49,
1119
+ "learning_rate": 9.331100255592437e-07,
1120
+ "logits/chosen": 0.7606201767921448,
1121
+ "logits/rejected": 0.8386918306350708,
1122
+ "logps/chosen": -89.20939636230469,
1123
+ "logps/rejected": -82.81990814208984,
1124
+ "loss": 0.6861,
1125
+ "rewards/accuracies": 0.3062500059604645,
1126
+ "rewards/chosen": -0.022489020600914955,
1127
+ "rewards/margins": 0.012643699534237385,
1128
+ "rewards/rejected": -0.03513271361589432,
1129
+ "step": 710
1130
+ },
1131
+ {
1132
+ "epoch": 1.51,
1133
+ "learning_rate": 8.628481651367876e-07,
1134
+ "logits/chosen": 0.771535336971283,
1135
+ "logits/rejected": 0.8548757433891296,
1136
+ "logps/chosen": -113.86335754394531,
1137
+ "logps/rejected": -101.86293029785156,
1138
+ "loss": 0.6856,
1139
+ "rewards/accuracies": 0.35624998807907104,
1140
+ "rewards/chosen": -0.02375701069831848,
1141
+ "rewards/margins": 0.01785343512892723,
1142
+ "rewards/rejected": -0.041610442101955414,
1143
+ "step": 720
1144
+ },
1145
+ {
1146
+ "epoch": 1.53,
1147
+ "learning_rate": 7.947809564230446e-07,
1148
+ "logits/chosen": 0.7982994914054871,
1149
+ "logits/rejected": 0.8568431735038757,
1150
+ "logps/chosen": -98.13008880615234,
1151
+ "logps/rejected": -97.87364196777344,
1152
+ "loss": 0.6874,
1153
+ "rewards/accuracies": 0.375,
1154
+ "rewards/chosen": -0.03274186700582504,
1155
+ "rewards/margins": 0.010785898193717003,
1156
+ "rewards/rejected": -0.0435277596116066,
1157
+ "step": 730
1158
+ },
1159
+ {
1160
+ "epoch": 1.55,
1161
+ "learning_rate": 7.289996455765749e-07,
1162
+ "logits/chosen": 0.7194357514381409,
1163
+ "logits/rejected": 0.7669156789779663,
1164
+ "logps/chosen": -88.61851501464844,
1165
+ "logps/rejected": -82.62269592285156,
1166
+ "loss": 0.6889,
1167
+ "rewards/accuracies": 0.2562499940395355,
1168
+ "rewards/chosen": -0.02295432984828949,
1169
+ "rewards/margins": 0.008415495045483112,
1170
+ "rewards/rejected": -0.03136982396245003,
1171
+ "step": 740
1172
+ },
1173
+ {
1174
+ "epoch": 1.57,
1175
+ "learning_rate": 6.655924144404907e-07,
1176
+ "logits/chosen": 0.7268679141998291,
1177
+ "logits/rejected": 0.7419520616531372,
1178
+ "logps/chosen": -90.82257843017578,
1179
+ "logps/rejected": -87.84976196289062,
1180
+ "loss": 0.687,
1181
+ "rewards/accuracies": 0.35624998807907104,
1182
+ "rewards/chosen": -0.024226779118180275,
1183
+ "rewards/margins": 0.014533626846969128,
1184
+ "rewards/rejected": -0.03876040503382683,
1185
+ "step": 750
1186
+ },
1187
+ {
1188
+ "epoch": 1.59,
1189
+ "learning_rate": 6.046442623320145e-07,
1190
+ "logits/chosen": 0.775734543800354,
1191
+ "logits/rejected": 0.8094171285629272,
1192
+ "logps/chosen": -111.5390853881836,
1193
+ "logps/rejected": -107.2262954711914,
1194
+ "loss": 0.6868,
1195
+ "rewards/accuracies": 0.36250001192092896,
1196
+ "rewards/chosen": -0.035627130419015884,
1197
+ "rewards/margins": 0.011030396446585655,
1198
+ "rewards/rejected": -0.04665752500295639,
1199
+ "step": 760
1200
+ },
1201
+ {
1202
+ "epoch": 1.61,
1203
+ "learning_rate": 5.462368920983249e-07,
1204
+ "logits/chosen": 0.7696752548217773,
1205
+ "logits/rejected": 0.7860345840454102,
1206
+ "logps/chosen": -86.94595336914062,
1207
+ "logps/rejected": -74.9552230834961,
1208
+ "loss": 0.6867,
1209
+ "rewards/accuracies": 0.3125,
1210
+ "rewards/chosen": -0.02532133087515831,
1211
+ "rewards/margins": 0.010699031874537468,
1212
+ "rewards/rejected": -0.03602036461234093,
1213
+ "step": 770
1214
+ },
1215
+ {
1216
+ "epoch": 1.63,
1217
+ "learning_rate": 4.904486005914027e-07,
1218
+ "logits/chosen": 0.7500615119934082,
1219
+ "logits/rejected": 0.7661712765693665,
1220
+ "logps/chosen": -119.89558410644531,
1221
+ "logps/rejected": -111.26094055175781,
1222
+ "loss": 0.6851,
1223
+ "rewards/accuracies": 0.3812499940395355,
1224
+ "rewards/chosen": -0.02393215522170067,
1225
+ "rewards/margins": 0.01878645084798336,
1226
+ "rewards/rejected": -0.04271860793232918,
1227
+ "step": 780
1228
+ },
1229
+ {
1230
+ "epoch": 1.65,
1231
+ "learning_rate": 4.373541737087264e-07,
1232
+ "logits/chosen": 0.7226775288581848,
1233
+ "logits/rejected": 0.7507297992706299,
1234
+ "logps/chosen": -103.21110534667969,
1235
+ "logps/rejected": -99.74224853515625,
1236
+ "loss": 0.6876,
1237
+ "rewards/accuracies": 0.29374998807907104,
1238
+ "rewards/chosen": -0.028694171458482742,
1239
+ "rewards/margins": 0.013696588575839996,
1240
+ "rewards/rejected": -0.04239075630903244,
1241
+ "step": 790
1242
+ },
1243
+ {
1244
+ "epoch": 1.67,
1245
+ "learning_rate": 3.8702478614051353e-07,
1246
+ "logits/chosen": 0.7573332786560059,
1247
+ "logits/rejected": 0.7167325019836426,
1248
+ "logps/chosen": -101.85111236572266,
1249
+ "logps/rejected": -95.10759735107422,
1250
+ "loss": 0.6857,
1251
+ "rewards/accuracies": 0.4124999940395355,
1252
+ "rewards/chosen": -0.02707928977906704,
1253
+ "rewards/margins": 0.022238872945308685,
1254
+ "rewards/rejected": -0.049318164587020874,
1255
+ "step": 800
1256
+ },
1257
+ {
1258
+ "epoch": 1.67,
1259
+ "eval_logits/chosen": 0.7340908646583557,
1260
+ "eval_logits/rejected": 0.7547872066497803,
1261
+ "eval_logps/chosen": -94.76182556152344,
1262
+ "eval_logps/rejected": -98.42695617675781,
1263
+ "eval_loss": 0.687368631362915,
1264
+ "eval_rewards/accuracies": 0.33399999141693115,
1265
+ "eval_rewards/chosen": -0.030344627797603607,
1266
+ "eval_rewards/margins": 0.011954776011407375,
1267
+ "eval_rewards/rejected": -0.04229940101504326,
1268
+ "eval_runtime": 272.7101,
1269
+ "eval_samples_per_second": 7.334,
1270
+ "eval_steps_per_second": 0.458,
1271
+ "step": 800
1272
+ },
1273
+ {
1274
+ "epoch": 1.7,
1275
+ "learning_rate": 3.3952790595787986e-07,
1276
+ "logits/chosen": 0.8084260821342468,
1277
+ "logits/rejected": 0.8206745982170105,
1278
+ "logps/chosen": -118.43504333496094,
1279
+ "logps/rejected": -105.46468353271484,
1280
+ "loss": 0.6876,
1281
+ "rewards/accuracies": 0.3187499940395355,
1282
+ "rewards/chosen": -0.03346724063158035,
1283
+ "rewards/margins": 0.010504155419766903,
1284
+ "rewards/rejected": -0.04397139698266983,
1285
+ "step": 810
1286
+ },
1287
+ {
1288
+ "epoch": 1.72,
1289
+ "learning_rate": 2.9492720416985004e-07,
1290
+ "logits/chosen": 0.6800391674041748,
1291
+ "logits/rejected": 0.6965945363044739,
1292
+ "logps/chosen": -96.83241271972656,
1293
+ "logps/rejected": -101.45925903320312,
1294
+ "loss": 0.6869,
1295
+ "rewards/accuracies": 0.3499999940395355,
1296
+ "rewards/chosen": -0.027126986533403397,
1297
+ "rewards/margins": 0.01599222794175148,
1298
+ "rewards/rejected": -0.04311921447515488,
1299
+ "step": 820
1300
+ },
1301
+ {
1302
+ "epoch": 1.74,
1303
+ "learning_rate": 2.5328246937043526e-07,
1304
+ "logits/chosen": 0.7581857442855835,
1305
+ "logits/rejected": 0.7956913113594055,
1306
+ "logps/chosen": -98.04803466796875,
1307
+ "logps/rejected": -102.8603515625,
1308
+ "loss": 0.6858,
1309
+ "rewards/accuracies": 0.38749998807907104,
1310
+ "rewards/chosen": -0.034231387078762054,
1311
+ "rewards/margins": 0.015285378322005272,
1312
+ "rewards/rejected": -0.049516770988702774,
1313
+ "step": 830
1314
+ },
1315
+ {
1316
+ "epoch": 1.76,
1317
+ "learning_rate": 2.1464952759020857e-07,
1318
+ "logits/chosen": 0.6740394830703735,
1319
+ "logits/rejected": 0.7218228578567505,
1320
+ "logps/chosen": -96.54594421386719,
1321
+ "logps/rejected": -99.02603912353516,
1322
+ "loss": 0.6878,
1323
+ "rewards/accuracies": 0.30000001192092896,
1324
+ "rewards/chosen": -0.029307430610060692,
1325
+ "rewards/margins": 0.010790064930915833,
1326
+ "rewards/rejected": -0.040097493678331375,
1327
+ "step": 840
1328
+ },
1329
+ {
1330
+ "epoch": 1.78,
1331
+ "learning_rate": 1.790801674598186e-07,
1332
+ "logits/chosen": 0.7388015985488892,
1333
+ "logits/rejected": 0.7736684083938599,
1334
+ "logps/chosen": -126.44425201416016,
1335
+ "logps/rejected": -102.01686096191406,
1336
+ "loss": 0.686,
1337
+ "rewards/accuracies": 0.39375001192092896,
1338
+ "rewards/chosen": -0.030984923243522644,
1339
+ "rewards/margins": 0.01829494535923004,
1340
+ "rewards/rejected": -0.049279868602752686,
1341
+ "step": 850
1342
+ },
1343
+ {
1344
+ "epoch": 1.8,
1345
+ "learning_rate": 1.4662207078575685e-07,
1346
+ "logits/chosen": 0.7661498785018921,
1347
+ "logits/rejected": 0.7773095965385437,
1348
+ "logps/chosen": -92.13005065917969,
1349
+ "logps/rejected": -91.37489318847656,
1350
+ "loss": 0.6844,
1351
+ "rewards/accuracies": 0.33125001192092896,
1352
+ "rewards/chosen": -0.028016680851578712,
1353
+ "rewards/margins": 0.021260341629385948,
1354
+ "rewards/rejected": -0.04927702248096466,
1355
+ "step": 860
1356
+ },
1357
+ {
1358
+ "epoch": 1.82,
1359
+ "learning_rate": 1.1731874863145143e-07,
1360
+ "logits/chosen": 0.7595205903053284,
1361
+ "logits/rejected": 0.8001340627670288,
1362
+ "logps/chosen": -94.4384765625,
1363
+ "logps/rejected": -92.37102508544922,
1364
+ "loss": 0.687,
1365
+ "rewards/accuracies": 0.3812499940395355,
1366
+ "rewards/chosen": -0.026580199599266052,
1367
+ "rewards/margins": 0.01631156913936138,
1368
+ "rewards/rejected": -0.042891766875982285,
1369
+ "step": 870
1370
+ },
1371
+ {
1372
+ "epoch": 1.84,
1373
+ "learning_rate": 9.120948298936422e-08,
1374
+ "logits/chosen": 0.7684468030929565,
1375
+ "logits/rejected": 0.8293699026107788,
1376
+ "logps/chosen": -89.8713150024414,
1377
+ "logps/rejected": -87.5567855834961,
1378
+ "loss": 0.6844,
1379
+ "rewards/accuracies": 0.34375,
1380
+ "rewards/chosen": -0.026115071028470993,
1381
+ "rewards/margins": 0.01522884052246809,
1382
+ "rewards/rejected": -0.04134391248226166,
1383
+ "step": 880
1384
+ },
1385
+ {
1386
+ "epoch": 1.86,
1387
+ "learning_rate": 6.832927412229017e-08,
1388
+ "logits/chosen": 0.7296298742294312,
1389
+ "logits/rejected": 0.769476056098938,
1390
+ "logps/chosen": -93.165283203125,
1391
+ "logps/rejected": -96.9769287109375,
1392
+ "loss": 0.6851,
1393
+ "rewards/accuracies": 0.34375,
1394
+ "rewards/chosen": -0.029177119955420494,
1395
+ "rewards/margins": 0.016743745654821396,
1396
+ "rewards/rejected": -0.04592086747288704,
1397
+ "step": 890
1398
+ },
1399
+ {
1400
+ "epoch": 1.88,
1401
+ "learning_rate": 4.870879364444109e-08,
1402
+ "logits/chosen": 0.7372065782546997,
1403
+ "logits/rejected": 0.7822612524032593,
1404
+ "logps/chosen": -89.78585052490234,
1405
+ "logps/rejected": -89.9983901977539,
1406
+ "loss": 0.6866,
1407
+ "rewards/accuracies": 0.3499999940395355,
1408
+ "rewards/chosen": -0.028756320476531982,
1409
+ "rewards/margins": 0.017704127356410027,
1410
+ "rewards/rejected": -0.04646044969558716,
1411
+ "step": 900
1412
+ },
1413
+ {
1414
+ "epoch": 1.88,
1415
+ "eval_logits/chosen": 0.732122004032135,
1416
+ "eval_logits/rejected": 0.7528373599052429,
1417
+ "eval_logps/chosen": -94.85501861572266,
1418
+ "eval_logps/rejected": -98.56551361083984,
1419
+ "eval_loss": 0.6872289776802063,
1420
+ "eval_rewards/accuracies": 0.34200000762939453,
1421
+ "eval_rewards/chosen": -0.03127633407711983,
1422
+ "eval_rewards/margins": 0.012408819980919361,
1423
+ "eval_rewards/rejected": -0.043685153126716614,
1424
+ "eval_runtime": 272.6053,
1425
+ "eval_samples_per_second": 7.337,
1426
+ "eval_steps_per_second": 0.459,
1427
+ "step": 900
1428
+ },
1429
+ {
1430
+ "epoch": 1.9,
1431
+ "learning_rate": 3.237434340521789e-08,
1432
+ "logits/chosen": 0.710226833820343,
1433
+ "logits/rejected": 0.7852008938789368,
1434
+ "logps/chosen": -103.32108306884766,
1435
+ "logps/rejected": -105.41300964355469,
1436
+ "loss": 0.6851,
1437
+ "rewards/accuracies": 0.35624998807907104,
1438
+ "rewards/chosen": -0.030542368069291115,
1439
+ "rewards/margins": 0.015573601238429546,
1440
+ "rewards/rejected": -0.04611596092581749,
1441
+ "step": 910
1442
+ },
1443
+ {
1444
+ "epoch": 1.93,
1445
+ "learning_rate": 1.93478202307823e-08,
1446
+ "logits/chosen": 0.7026702165603638,
1447
+ "logits/rejected": 0.7310872673988342,
1448
+ "logps/chosen": -98.82904815673828,
1449
+ "logps/rejected": -96.06184387207031,
1450
+ "loss": 0.6866,
1451
+ "rewards/accuracies": 0.33125001192092896,
1452
+ "rewards/chosen": -0.02804415300488472,
1453
+ "rewards/margins": 0.011173558421432972,
1454
+ "rewards/rejected": -0.03921770304441452,
1455
+ "step": 920
1456
+ },
1457
+ {
1458
+ "epoch": 1.95,
1459
+ "learning_rate": 9.646686570697062e-09,
1460
+ "logits/chosen": 0.7164516448974609,
1461
+ "logits/rejected": 0.8162258863449097,
1462
+ "logps/chosen": -104.56522369384766,
1463
+ "logps/rejected": -101.3372802734375,
1464
+ "loss": 0.6871,
1465
+ "rewards/accuracies": 0.3812499940395355,
1466
+ "rewards/chosen": -0.024641428142786026,
1467
+ "rewards/margins": 0.01806877739727497,
1468
+ "rewards/rejected": -0.042710207402706146,
1469
+ "step": 930
1470
+ },
1471
+ {
1472
+ "epoch": 1.97,
1473
+ "learning_rate": 3.283947088983663e-09,
1474
+ "logits/chosen": 0.7736842632293701,
1475
+ "logits/rejected": 0.7780998945236206,
1476
+ "logps/chosen": -100.5300064086914,
1477
+ "logps/rejected": -111.0368881225586,
1478
+ "loss": 0.6875,
1479
+ "rewards/accuracies": 0.38749998807907104,
1480
+ "rewards/chosen": -0.0371762290596962,
1481
+ "rewards/margins": 0.015147706493735313,
1482
+ "rewards/rejected": -0.05232393741607666,
1483
+ "step": 940
1484
+ },
1485
+ {
1486
+ "epoch": 1.99,
1487
+ "learning_rate": 2.681312309735229e-10,
1488
+ "logits/chosen": 0.8036483526229858,
1489
+ "logits/rejected": 0.8175506591796875,
1490
+ "logps/chosen": -94.41046905517578,
1491
+ "logps/rejected": -104.76774597167969,
1492
+ "loss": 0.6875,
1493
+ "rewards/accuracies": 0.3062500059604645,
1494
+ "rewards/chosen": -0.03001193329691887,
1495
+ "rewards/margins": 0.011342789977788925,
1496
+ "rewards/rejected": -0.04135472699999809,
1497
+ "step": 950
1498
+ },
1499
+ {
1500
+ "epoch": 2.0,
1501
+ "step": 954,
1502
  "total_flos": 0.0,
1503
+ "train_loss": 0.3994182998029441,
1504
+ "train_runtime": 8026.3391,
1505
+ "train_samples_per_second": 7.617,
1506
+ "train_steps_per_second": 0.119
1507
  }
1508
  ],
1509
  "logging_steps": 10,
1510
+ "max_steps": 954,
1511
  "num_input_tokens_seen": 0,
1512
+ "num_train_epochs": 2,
1513
  "save_steps": 100,
1514
  "total_flos": 0.0,
1515
  "train_batch_size": 4,