fenguhao commited on
Commit
54e9029
·
verified ·
1 Parent(s): 00ceaf5

Model save

Browse files
README.md CHANGED
@@ -17,15 +17,15 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.7774
21
- - Rewards/chosen: -2.3054
22
- - Rewards/rejected: -3.6762
23
- - Rewards/accuracies: 0.7857
24
- - Rewards/margins: 1.3708
25
- - Logps/rejected: -629.3941
26
- - Logps/chosen: -514.6497
27
- - Logits/rejected: 3.0974
28
- - Logits/chosen: 1.8746
29
 
30
  ## Model description
31
 
@@ -45,14 +45,14 @@ More information needed
45
 
46
  The following hyperparameters were used during training:
47
  - learning_rate: 5e-07
48
- - train_batch_size: 8
49
- - eval_batch_size: 8
50
  - seed: 42
51
  - distributed_type: multi-GPU
52
- - num_devices: 4
53
- - gradient_accumulation_steps: 2
54
- - total_train_batch_size: 64
55
- - total_eval_batch_size: 32
56
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
57
  - lr_scheduler_type: cosine
58
  - lr_scheduler_warmup_ratio: 0.1
@@ -62,15 +62,11 @@ The following hyperparameters were used during training:
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
- | 1.0407 | 0.1 | 100 | 1.0319 | -0.3679 | -0.5944 | 0.6845 | 0.2265 | -321.2206 | -320.9028 | -2.4869 | -2.5233 |
66
- | 0.8874 | 0.21 | 200 | 0.8947 | -0.8206 | -1.5352 | 0.7480 | 0.7146 | -415.2939 | -366.1670 | -0.4438 | -0.9008 |
67
- | 0.8068 | 0.31 | 300 | 0.8382 | -1.4666 | -2.4781 | 0.7540 | 1.0115 | -509.5933 | -430.7722 | 1.4832 | 0.8117 |
68
- | 0.7845 | 0.42 | 400 | 0.8209 | -1.7788 | -2.9187 | 0.7520 | 1.1399 | -553.6510 | -461.9887 | 2.8755 | 2.1264 |
69
- | 0.8323 | 0.52 | 500 | 0.8332 | -1.4352 | -2.5462 | 0.7440 | 1.1110 | -516.3953 | -427.6284 | 2.1308 | 1.3306 |
70
- | 0.7677 | 0.63 | 600 | 0.7981 | -2.1915 | -3.5501 | 0.7520 | 1.3586 | -616.7921 | -503.2610 | 3.3207 | 1.8966 |
71
- | 0.7227 | 0.73 | 700 | 0.7834 | -2.2316 | -3.6191 | 0.7639 | 1.3876 | -623.6929 | -507.2672 | 2.9762 | 1.7450 |
72
- | 0.7455 | 0.84 | 800 | 0.7792 | -2.4217 | -3.8218 | 0.7758 | 1.4001 | -643.9626 | -526.2844 | 3.2439 | 1.9906 |
73
- | 0.7785 | 0.94 | 900 | 0.7779 | -2.2974 | -3.6650 | 0.7798 | 1.3675 | -628.2753 | -513.8542 | 3.0941 | 1.8722 |
74
 
75
 
76
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.5046
21
+ - Rewards/chosen: -1.1826
22
+ - Rewards/rejected: -2.0581
23
+ - Rewards/accuracies: 0.7246
24
+ - Rewards/margins: 0.8756
25
+ - Logps/rejected: -470.5493
26
+ - Logps/chosen: -395.9858
27
+ - Logits/rejected: 0.0457
28
+ - Logits/chosen: -0.4473
29
 
30
  ## Model description
31
 
 
45
 
46
  The following hyperparameters were used during training:
47
  - learning_rate: 5e-07
48
+ - train_batch_size: 4
49
+ - eval_batch_size: 4
50
  - seed: 42
51
  - distributed_type: multi-GPU
52
+ - num_devices: 3
53
+ - gradient_accumulation_steps: 10
54
+ - total_train_batch_size: 120
55
+ - total_eval_batch_size: 12
56
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
57
  - lr_scheduler_type: cosine
58
  - lr_scheduler_warmup_ratio: 0.1
 
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
+ | 0.5764 | 0.2 | 100 | 0.5829 | -0.3592 | -0.7613 | 0.6931 | 0.4020 | -340.8605 | -313.6503 | -2.4360 | -2.4791 |
66
+ | 0.5169 | 0.39 | 200 | 0.5312 | -0.8847 | -1.6204 | 0.7066 | 0.7356 | -426.7720 | -366.2012 | -0.8443 | -1.2010 |
67
+ | 0.5133 | 0.59 | 300 | 0.5159 | -1.1886 | -1.9604 | 0.7246 | 0.7718 | -460.7765 | -396.5906 | 0.0460 | -0.3853 |
68
+ | 0.4968 | 0.79 | 400 | 0.5058 | -1.2445 | -2.1063 | 0.7141 | 0.8618 | -475.3639 | -402.1766 | 0.2014 | -0.2552 |
69
+ | 0.4833 | 0.98 | 500 | 0.5045 | -1.1821 | -2.0581 | 0.7260 | 0.8760 | -470.5448 | -395.9374 | 0.0436 | -0.4496 |
 
 
 
 
70
 
71
 
72
  ### Framework versions
all_results.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_logits/chosen": 1.8746349811553955,
4
- "eval_logits/rejected": 3.097362756729126,
5
- "eval_logps/chosen": -514.6497192382812,
6
- "eval_logps/rejected": -629.3941040039062,
7
- "eval_loss": 0.7774083614349365,
8
- "eval_rewards/accuracies": 0.7857142686843872,
9
- "eval_rewards/chosen": -2.305399179458618,
10
- "eval_rewards/margins": 1.3707566261291504,
11
- "eval_rewards/rejected": -3.6761555671691895,
12
- "eval_runtime": 244.4926,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 8.18,
15
- "eval_steps_per_second": 0.258,
16
- "train_loss": 0.8416462149295507,
17
- "train_runtime": 20662.0179,
18
  "train_samples": 61135,
19
- "train_samples_per_second": 2.959,
20
- "train_steps_per_second": 0.046
21
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_logits/chosen": -0.44726553559303284,
4
+ "eval_logits/rejected": 0.045745834708213806,
5
+ "eval_logps/chosen": -395.98583984375,
6
+ "eval_logps/rejected": -470.54931640625,
7
+ "eval_loss": 0.5046471357345581,
8
+ "eval_rewards/accuracies": 0.7245509028434753,
9
+ "eval_rewards/chosen": -1.1825801134109497,
10
+ "eval_rewards/margins": 0.8755642771720886,
11
+ "eval_rewards/rejected": -2.0581440925598145,
12
+ "eval_runtime": 494.7185,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 4.043,
15
+ "eval_steps_per_second": 0.338,
16
+ "train_loss": 0.5401819272219315,
17
+ "train_runtime": 34352.758,
18
  "train_samples": 61135,
19
+ "train_samples_per_second": 1.78,
20
+ "train_steps_per_second": 0.015
21
  }
eval_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_logits/chosen": 1.8746349811553955,
4
- "eval_logits/rejected": 3.097362756729126,
5
- "eval_logps/chosen": -514.6497192382812,
6
- "eval_logps/rejected": -629.3941040039062,
7
- "eval_loss": 0.7774083614349365,
8
- "eval_rewards/accuracies": 0.7857142686843872,
9
- "eval_rewards/chosen": -2.305399179458618,
10
- "eval_rewards/margins": 1.3707566261291504,
11
- "eval_rewards/rejected": -3.6761555671691895,
12
- "eval_runtime": 244.4926,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 8.18,
15
- "eval_steps_per_second": 0.258
16
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_logits/chosen": -0.44726553559303284,
4
+ "eval_logits/rejected": 0.045745834708213806,
5
+ "eval_logps/chosen": -395.98583984375,
6
+ "eval_logps/rejected": -470.54931640625,
7
+ "eval_loss": 0.5046471357345581,
8
+ "eval_rewards/accuracies": 0.7245509028434753,
9
+ "eval_rewards/chosen": -1.1825801134109497,
10
+ "eval_rewards/margins": 0.8755642771720886,
11
+ "eval_rewards/rejected": -2.0581440925598145,
12
+ "eval_runtime": 494.7185,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 4.043,
15
+ "eval_steps_per_second": 0.338
16
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25fd3bf4215bbd131ceb23b460f9d81e046f3c7f8036538a66eec9cf5df4d133
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bddcb477d44b03d21f482dfee881e87dc11347525cffd01a54e6bbe7b24b6083
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16fa4e673a137503a5ffd3f374fdfbeeaa6937c1a117948d3eb7a4b379896b40
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5751b4a0c756ee1e97af6b65bce15ff65004fd334bd8f9068769e14fc685e3c
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88d174023efd91b4ab079ed43a18c1fc247282d94b9776c3e5cd126b9c4cef87
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9f5e849324cd1452cd76c40ab2420e8932e50bd8cdf6876faad637044d35237
3
  size 4540516344
runs/Oct26_07-09-09_RLHF000/events.out.tfevents.1729926715.RLHF000.205082.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8198256bf241db9ffcc9aaa0a30c95c52049316afc0c6cd7d6050d3ef7a723a9
3
+ size 4505
runs/Oct26_07-40-14_RLHF000/events.out.tfevents.1729928440.RLHF000.213828.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5a4bcd2eb605dbeb99a9139d4de439c91723d61ae9cbe5cccc5bf5943a3310c
3
+ size 5128
runs/Oct26_07-43-10_RLHF000/events.out.tfevents.1729928610.RLHF000.215421.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:372271a3da8d1fdb5d70213071193f32bbcf69751d7b5e75e6fcc0c0448e58b2
3
+ size 4506
runs/Oct26_07-44-28_RLHF000/events.out.tfevents.1729928694.RLHF000.216128.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:162190b88bd4e01f86146512d2834e576a7ae2ad1464214e871175d6b9e3160c
3
+ size 5129
runs/Oct26_08-01-52_RLHF000/events.out.tfevents.1729929739.RLHF000.222017.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35834f9ed8e1e2f807d7382117ff2319a1f7373bf4a143e41f0b9085b39e509e
3
+ size 40739
runs/Oct26_08-01-52_RLHF000/events.out.tfevents.1729964586.RLHF000.222017.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8004aa31687b57a78dabf8077b9977829e8ec42aaac1dd1acb1e47d9479ed9e5
3
+ size 828
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.8416462149295507,
4
- "train_runtime": 20662.0179,
5
  "train_samples": 61135,
6
- "train_samples_per_second": 2.959,
7
- "train_steps_per_second": 0.046
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.5401819272219315,
4
+ "train_runtime": 34352.758,
5
  "train_samples": 61135,
6
+ "train_samples_per_second": 1.78,
7
+ "train_steps_per_second": 0.015
8
  }
trainer_state.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9994767137624281,
5
  "eval_steps": 100,
6
- "global_step": 955,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 5.208333333333333e-09,
14
- "logits/chosen": -2.919764995574951,
15
- "logits/rejected": -2.686896800994873,
16
- "logps/chosen": -229.94229125976562,
17
- "logps/rejected": -214.70114135742188,
18
- "loss": 1.1369,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -23,1496 +23,802 @@
23
  "step": 1
24
  },
25
  {
26
- "epoch": 0.01,
27
- "learning_rate": 5.208333333333333e-08,
28
- "logits/chosen": -2.680727005004883,
29
- "logits/rejected": -2.7090559005737305,
30
- "logps/chosen": -295.7759094238281,
31
- "logps/rejected": -250.66514587402344,
32
- "loss": 1.1367,
33
- "rewards/accuracies": 0.5208333134651184,
34
- "rewards/chosen": 0.0007360066520050168,
35
- "rewards/margins": 0.0012527304934337735,
36
- "rewards/rejected": -0.0005167239578440785,
37
  "step": 10
38
  },
39
  {
40
- "epoch": 0.02,
41
- "learning_rate": 1.0416666666666667e-07,
42
- "logits/chosen": -2.6195316314697266,
43
- "logits/rejected": -2.625615358352661,
44
- "logps/chosen": -271.3199462890625,
45
- "logps/rejected": -246.9070587158203,
46
- "loss": 1.1366,
47
- "rewards/accuracies": 0.543749988079071,
48
- "rewards/chosen": 0.00032243202440440655,
49
- "rewards/margins": 0.00047141723916865885,
50
- "rewards/rejected": -0.0001489851565565914,
51
  "step": 20
52
  },
53
  {
54
- "epoch": 0.03,
55
- "learning_rate": 1.5624999999999999e-07,
56
- "logits/chosen": -2.7032415866851807,
57
- "logits/rejected": -2.6663870811462402,
58
- "logps/chosen": -278.2927551269531,
59
- "logps/rejected": -254.49044799804688,
60
- "loss": 1.1358,
61
- "rewards/accuracies": 0.550000011920929,
62
- "rewards/chosen": 0.0014447562862187624,
63
- "rewards/margins": 0.001622636104002595,
64
- "rewards/rejected": -0.0001778797450242564,
65
  "step": 30
66
  },
67
  {
68
- "epoch": 0.04,
69
- "learning_rate": 2.0833333333333333e-07,
70
- "logits/chosen": -2.65032696723938,
71
- "logits/rejected": -2.6380372047424316,
72
- "logps/chosen": -273.85882568359375,
73
- "logps/rejected": -237.75418090820312,
74
- "loss": 1.1334,
75
- "rewards/accuracies": 0.65625,
76
- "rewards/chosen": 0.0036526708863675594,
77
- "rewards/margins": 0.006543067749589682,
78
- "rewards/rejected": -0.002890397561714053,
79
  "step": 40
80
  },
81
  {
82
- "epoch": 0.05,
83
- "learning_rate": 2.604166666666667e-07,
84
- "logits/chosen": -2.6743686199188232,
85
- "logits/rejected": -2.638240337371826,
86
- "logps/chosen": -296.05084228515625,
87
- "logps/rejected": -274.6942138671875,
88
- "loss": 1.1279,
89
- "rewards/accuracies": 0.699999988079071,
90
- "rewards/chosen": 0.00925111211836338,
91
- "rewards/margins": 0.01603672280907631,
92
- "rewards/rejected": -0.0067856102250516415,
93
  "step": 50
94
  },
95
  {
96
- "epoch": 0.06,
97
- "learning_rate": 3.1249999999999997e-07,
98
- "logits/chosen": -2.6307342052459717,
99
- "logits/rejected": -2.633100986480713,
100
- "logps/chosen": -285.3183288574219,
101
- "logps/rejected": -274.36505126953125,
102
- "loss": 1.1188,
103
  "rewards/accuracies": 0.6625000238418579,
104
- "rewards/chosen": 0.024691741913557053,
105
- "rewards/margins": 0.02844650112092495,
106
- "rewards/rejected": -0.0037547596730291843,
107
  "step": 60
108
  },
109
  {
110
- "epoch": 0.07,
111
- "learning_rate": 3.645833333333333e-07,
112
- "logits/chosen": -2.649703025817871,
113
- "logits/rejected": -2.6850831508636475,
114
- "logps/chosen": -311.02667236328125,
115
- "logps/rejected": -290.6634521484375,
116
- "loss": 1.0997,
117
- "rewards/accuracies": 0.6875,
118
- "rewards/chosen": 0.03477492183446884,
119
- "rewards/margins": 0.05690314620733261,
120
- "rewards/rejected": -0.02212822437286377,
121
  "step": 70
122
  },
123
  {
124
- "epoch": 0.08,
125
- "learning_rate": 4.1666666666666667e-07,
126
- "logits/chosen": -2.523608446121216,
127
- "logits/rejected": -2.4657652378082275,
128
- "logps/chosen": -304.8270568847656,
129
- "logps/rejected": -282.1708068847656,
130
- "loss": 1.0646,
131
- "rewards/accuracies": 0.7562500238418579,
132
- "rewards/chosen": -0.04266184940934181,
133
- "rewards/margins": 0.1203114241361618,
134
- "rewards/rejected": -0.1629732847213745,
135
  "step": 80
136
  },
137
  {
138
- "epoch": 0.09,
139
- "learning_rate": 4.6874999999999996e-07,
140
- "logits/chosen": -2.5294718742370605,
141
- "logits/rejected": -2.5001423358917236,
142
- "logps/chosen": -293.15252685546875,
143
- "logps/rejected": -285.61920166015625,
144
- "loss": 1.0475,
145
- "rewards/accuracies": 0.699999988079071,
146
- "rewards/chosen": -0.012959107756614685,
147
- "rewards/margins": 0.17945662140846252,
148
- "rewards/rejected": -0.1924157440662384,
149
  "step": 90
150
  },
151
  {
152
- "epoch": 0.1,
153
- "learning_rate": 4.999732492681437e-07,
154
- "logits/chosen": -2.480851650238037,
155
- "logits/rejected": -2.468801259994507,
156
- "logps/chosen": -335.23480224609375,
157
- "logps/rejected": -339.78021240234375,
158
- "loss": 1.0407,
159
- "rewards/accuracies": 0.6937500238418579,
160
- "rewards/chosen": -0.2980949878692627,
161
- "rewards/margins": 0.177928164601326,
162
- "rewards/rejected": -0.4760231375694275,
163
  "step": 100
164
  },
165
  {
166
- "epoch": 0.1,
167
- "eval_logits/chosen": -2.5232999324798584,
168
- "eval_logits/rejected": -2.486931324005127,
169
- "eval_logps/chosen": -320.90283203125,
170
- "eval_logps/rejected": -321.2205810546875,
171
- "eval_loss": 1.0318840742111206,
172
- "eval_rewards/accuracies": 0.6845238208770752,
173
- "eval_rewards/chosen": -0.36793097853660583,
174
- "eval_rewards/margins": 0.2264895737171173,
175
- "eval_rewards/rejected": -0.5944206118583679,
176
- "eval_runtime": 243.2829,
177
- "eval_samples_per_second": 8.221,
178
- "eval_steps_per_second": 0.259,
179
  "step": 100
180
  },
181
  {
182
- "epoch": 0.12,
183
- "learning_rate": 4.996723692767926e-07,
184
- "logits/chosen": -2.2419214248657227,
185
- "logits/rejected": -2.1671438217163086,
186
- "logps/chosen": -305.33355712890625,
187
- "logps/rejected": -294.58404541015625,
188
- "loss": 1.0052,
189
- "rewards/accuracies": 0.6875,
190
- "rewards/chosen": -0.4501020312309265,
191
- "rewards/margins": 0.32556745409965515,
192
- "rewards/rejected": -0.7756695747375488,
193
  "step": 110
194
  },
195
  {
196
- "epoch": 0.13,
197
- "learning_rate": 4.990375746213598e-07,
198
- "logits/chosen": -1.2528715133666992,
199
- "logits/rejected": -1.0699832439422607,
200
- "logps/chosen": -357.62774658203125,
201
- "logps/rejected": -348.74688720703125,
202
- "loss": 0.9576,
203
- "rewards/accuracies": 0.71875,
204
- "rewards/chosen": -0.5205128788948059,
205
- "rewards/margins": 0.43326228857040405,
206
- "rewards/rejected": -0.9537751078605652,
207
  "step": 120
208
  },
209
  {
210
- "epoch": 0.14,
211
- "learning_rate": 4.980697142834314e-07,
212
- "logits/chosen": -0.781643271446228,
213
- "logits/rejected": -0.5821924209594727,
214
- "logps/chosen": -406.4781799316406,
215
- "logps/rejected": -411.183837890625,
216
- "loss": 0.9475,
217
- "rewards/accuracies": 0.6875,
218
- "rewards/chosen": -0.8618149757385254,
219
- "rewards/margins": 0.5553635954856873,
220
- "rewards/rejected": -1.4171785116195679,
221
  "step": 130
222
  },
223
  {
224
- "epoch": 0.15,
225
- "learning_rate": 4.967700826904229e-07,
226
- "logits/chosen": -0.470319926738739,
227
- "logits/rejected": -0.3660030961036682,
228
- "logps/chosen": -321.44873046875,
229
- "logps/rejected": -374.0142517089844,
230
- "loss": 0.8765,
231
- "rewards/accuracies": 0.7124999761581421,
232
- "rewards/chosen": -0.8166507482528687,
233
- "rewards/margins": 0.6684588193893433,
234
- "rewards/rejected": -1.485109567642212,
235
  "step": 140
236
  },
237
  {
238
- "epoch": 0.16,
239
- "learning_rate": 4.951404179843962e-07,
240
- "logits/chosen": -0.061715979129076004,
241
- "logits/rejected": -0.19356076419353485,
242
- "logps/chosen": -405.5045471191406,
243
- "logps/rejected": -456.53271484375,
244
- "loss": 0.9764,
245
- "rewards/accuracies": 0.6187499761581421,
246
- "rewards/chosen": -1.2846033573150635,
247
- "rewards/margins": 0.43445801734924316,
248
- "rewards/rejected": -1.7190614938735962,
249
  "step": 150
250
  },
251
  {
252
- "epoch": 0.17,
253
- "learning_rate": 4.931828996974498e-07,
254
- "logits/chosen": -1.1201887130737305,
255
- "logits/rejected": -0.6151416897773743,
256
- "logps/chosen": -380.142822265625,
257
- "logps/rejected": -427.4412536621094,
258
- "loss": 0.8888,
259
- "rewards/accuracies": 0.737500011920929,
260
- "rewards/chosen": -0.8433526754379272,
261
- "rewards/margins": 0.5253406763076782,
262
- "rewards/rejected": -1.368693232536316,
263
  "step": 160
264
  },
265
  {
266
- "epoch": 0.18,
267
- "learning_rate": 4.909001458367866e-07,
268
- "logits/chosen": -0.5463464260101318,
269
- "logits/rejected": -0.36673182249069214,
270
- "logps/chosen": -386.74749755859375,
271
- "logps/rejected": -429.04156494140625,
272
- "loss": 0.8703,
273
- "rewards/accuracies": 0.6937500238418579,
274
- "rewards/chosen": -1.1368268728256226,
275
- "rewards/margins": 0.681709885597229,
276
- "rewards/rejected": -1.8185367584228516,
277
  "step": 170
278
  },
279
  {
280
- "epoch": 0.19,
281
- "learning_rate": 4.882952093833627e-07,
282
- "logits/chosen": 0.5949804186820984,
283
- "logits/rejected": 0.9649137258529663,
284
- "logps/chosen": -385.0345764160156,
285
- "logps/rejected": -451.2206115722656,
286
- "loss": 0.8344,
287
- "rewards/accuracies": 0.699999988079071,
288
- "rewards/chosen": -1.35834801197052,
289
- "rewards/margins": 0.8335322141647339,
290
- "rewards/rejected": -2.191880464553833,
291
  "step": 180
292
  },
293
  {
294
- "epoch": 0.2,
295
- "learning_rate": 4.853715742087946e-07,
296
- "logits/chosen": 0.7401331663131714,
297
- "logits/rejected": 1.3593542575836182,
298
- "logps/chosen": -480.5682067871094,
299
- "logps/rejected": -533.142333984375,
300
- "loss": 0.8896,
301
- "rewards/accuracies": 0.6937500238418579,
302
- "rewards/chosen": -2.017334461212158,
303
- "rewards/margins": 0.7530598044395447,
304
- "rewards/rejected": -2.7703945636749268,
305
  "step": 190
306
  },
307
  {
308
- "epoch": 0.21,
309
- "learning_rate": 4.821331504159906e-07,
310
- "logits/chosen": 0.07484010607004166,
311
- "logits/rejected": 0.5299566388130188,
312
- "logps/chosen": -432.9810485839844,
313
- "logps/rejected": -490.89599609375,
314
- "loss": 0.8874,
315
- "rewards/accuracies": 0.71875,
316
- "rewards/chosen": -1.4631267786026,
317
- "rewards/margins": 0.6909047365188599,
318
- "rewards/rejected": -2.154031276702881,
319
  "step": 200
320
  },
321
  {
322
- "epoch": 0.21,
323
- "eval_logits/chosen": -0.9007886648178101,
324
- "eval_logits/rejected": -0.44379544258117676,
325
- "eval_logps/chosen": -366.1669616699219,
326
- "eval_logps/rejected": -415.2938537597656,
327
- "eval_loss": 0.8947122097015381,
328
- "eval_rewards/accuracies": 0.7480158805847168,
329
- "eval_rewards/chosen": -0.8205717206001282,
330
- "eval_rewards/margins": 0.7145815491676331,
331
- "eval_rewards/rejected": -1.5351535081863403,
332
- "eval_runtime": 244.6801,
333
- "eval_samples_per_second": 8.174,
334
- "eval_steps_per_second": 0.257,
335
  "step": 200
336
  },
337
  {
338
- "epoch": 0.22,
339
- "learning_rate": 4.785842691097342e-07,
340
- "logits/chosen": -0.7449830770492554,
341
- "logits/rejected": -0.08534111082553864,
342
- "logps/chosen": -388.907470703125,
343
- "logps/rejected": -403.8260498046875,
344
- "loss": 0.8806,
345
- "rewards/accuracies": 0.7124999761581421,
346
- "rewards/chosen": -0.8363178968429565,
347
- "rewards/margins": 0.6629363298416138,
348
- "rewards/rejected": -1.4992539882659912,
349
  "step": 210
350
  },
351
  {
352
- "epoch": 0.23,
353
- "learning_rate": 4.7472967660421603e-07,
354
- "logits/chosen": 0.45267248153686523,
355
- "logits/rejected": 0.8757249116897583,
356
- "logps/chosen": -412.01007080078125,
357
- "logps/rejected": -479.5518493652344,
358
- "loss": 0.8932,
359
- "rewards/accuracies": 0.737500011920929,
360
- "rewards/chosen": -1.2247607707977295,
361
- "rewards/margins": 0.8075494766235352,
362
- "rewards/rejected": -2.0323100090026855,
363
  "step": 220
364
  },
365
  {
366
- "epoch": 0.24,
367
- "learning_rate": 4.705745280752585e-07,
368
- "logits/chosen": 0.03533775731921196,
369
- "logits/rejected": 0.6978858709335327,
370
- "logps/chosen": -428.5148010253906,
371
- "logps/rejected": -484.0467834472656,
372
- "loss": 0.8581,
373
- "rewards/accuracies": 0.7562500238418579,
374
- "rewards/chosen": -1.3480390310287476,
375
- "rewards/margins": 1.0059764385223389,
376
- "rewards/rejected": -2.354015588760376,
377
  "step": 230
378
  },
379
  {
380
- "epoch": 0.25,
381
- "learning_rate": 4.6612438066572555e-07,
382
- "logits/chosen": -0.08093588799238205,
383
- "logits/rejected": 0.7299788594245911,
384
- "logps/chosen": -392.78790283203125,
385
- "logps/rejected": -432.1263122558594,
386
- "loss": 0.8099,
387
- "rewards/accuracies": 0.7250000238418579,
388
- "rewards/chosen": -1.1553189754486084,
389
- "rewards/margins": 0.9055362939834595,
390
- "rewards/rejected": -2.0608553886413574,
391
  "step": 240
392
  },
393
  {
394
- "epoch": 0.26,
395
- "learning_rate": 4.6138518605333664e-07,
396
- "logits/chosen": 0.8494974970817566,
397
- "logits/rejected": 1.0580047369003296,
398
- "logps/chosen": -410.2496032714844,
399
- "logps/rejected": -510.062744140625,
400
- "loss": 0.857,
401
- "rewards/accuracies": 0.71875,
402
- "rewards/chosen": -1.4266130924224854,
403
- "rewards/margins": 0.8598226308822632,
404
- "rewards/rejected": -2.286435604095459,
405
  "step": 250
406
  },
407
  {
408
- "epoch": 0.27,
409
- "learning_rate": 4.5636328249082514e-07,
410
- "logits/chosen": 0.6665963530540466,
411
- "logits/rejected": 1.3317415714263916,
412
- "logps/chosen": -429.8251037597656,
413
- "logps/rejected": -490.12237548828125,
414
- "loss": 0.8674,
415
- "rewards/accuracies": 0.7437499761581421,
416
- "rewards/chosen": -1.4576373100280762,
417
- "rewards/margins": 0.86492520570755,
418
- "rewards/rejected": -2.3225626945495605,
419
  "step": 260
420
  },
421
  {
422
- "epoch": 0.28,
423
- "learning_rate": 4.510653863290871e-07,
424
- "logits/chosen": -0.21562974154949188,
425
- "logits/rejected": 0.4239919185638428,
426
- "logps/chosen": -415.57073974609375,
427
- "logps/rejected": -461.1295471191406,
428
- "loss": 0.8284,
429
- "rewards/accuracies": 0.75,
430
- "rewards/chosen": -1.1371711492538452,
431
- "rewards/margins": 0.9719129800796509,
432
- "rewards/rejected": -2.109084367752075,
433
  "step": 270
434
  },
435
  {
436
- "epoch": 0.29,
437
- "learning_rate": 4.4549858303465737e-07,
438
- "logits/chosen": 0.07179277390241623,
439
- "logits/rejected": 0.6087414026260376,
440
- "logps/chosen": -405.73614501953125,
441
- "logps/rejected": -490.55908203125,
442
- "loss": 0.8244,
443
- "rewards/accuracies": 0.7749999761581421,
444
- "rewards/chosen": -1.145477056503296,
445
- "rewards/margins": 0.9263471364974976,
446
- "rewards/rejected": -2.071824312210083,
447
  "step": 280
448
  },
449
  {
450
- "epoch": 0.3,
451
- "learning_rate": 4.396703177135261e-07,
452
- "logits/chosen": 0.4951688349246979,
453
- "logits/rejected": 0.9030885696411133,
454
- "logps/chosen": -423.0415954589844,
455
- "logps/rejected": -468.3642578125,
456
- "loss": 0.852,
457
- "rewards/accuracies": 0.71875,
458
- "rewards/chosen": -1.3801145553588867,
459
- "rewards/margins": 0.780128002166748,
460
- "rewards/rejected": -2.1602425575256348,
461
  "step": 290
462
  },
463
  {
464
- "epoch": 0.31,
465
- "learning_rate": 4.335883851539693e-07,
466
- "logits/chosen": 0.6263070106506348,
467
- "logits/rejected": 1.3198049068450928,
468
- "logps/chosen": -407.26031494140625,
469
- "logps/rejected": -485.51983642578125,
470
- "loss": 0.8068,
471
- "rewards/accuracies": 0.768750011920929,
472
- "rewards/chosen": -1.2382786273956299,
473
- "rewards/margins": 1.1074926853179932,
474
- "rewards/rejected": -2.345771074295044,
475
  "step": 300
476
  },
477
  {
478
- "epoch": 0.31,
479
- "eval_logits/chosen": 0.8117178082466125,
480
- "eval_logits/rejected": 1.4832301139831543,
481
- "eval_logps/chosen": -430.772216796875,
482
- "eval_logps/rejected": -509.59326171875,
483
- "eval_loss": 0.8381595015525818,
484
- "eval_rewards/accuracies": 0.7539682388305664,
485
- "eval_rewards/chosen": -1.4666248559951782,
486
- "eval_rewards/margins": 1.0115222930908203,
487
- "eval_rewards/rejected": -2.478147029876709,
488
- "eval_runtime": 243.7025,
489
- "eval_samples_per_second": 8.207,
490
- "eval_steps_per_second": 0.259,
491
  "step": 300
492
  },
493
  {
494
- "epoch": 0.32,
495
- "learning_rate": 4.272609194017105e-07,
496
- "logits/chosen": 0.9425480961799622,
497
- "logits/rejected": 1.6004616022109985,
498
- "logps/chosen": -424.9742126464844,
499
- "logps/rejected": -530.65869140625,
500
- "loss": 0.7827,
501
- "rewards/accuracies": 0.768750011920929,
502
- "rewards/chosen": -1.427599549293518,
503
- "rewards/margins": 1.1877686977386475,
504
- "rewards/rejected": -2.615368127822876,
505
  "step": 310
506
  },
507
  {
508
- "epoch": 0.33,
509
- "learning_rate": 4.2069638288135547e-07,
510
- "logits/chosen": 0.7390011548995972,
511
- "logits/rejected": 1.4064347743988037,
512
- "logps/chosen": -448.7197265625,
513
- "logps/rejected": -525.7418823242188,
514
- "loss": 0.827,
515
- "rewards/accuracies": 0.6875,
516
- "rewards/chosen": -1.670648217201233,
517
- "rewards/margins": 0.8930233120918274,
518
- "rewards/rejected": -2.563671112060547,
519
  "step": 320
520
  },
521
  {
522
- "epoch": 0.35,
523
- "learning_rate": 4.139035550786494e-07,
524
- "logits/chosen": 1.1555938720703125,
525
- "logits/rejected": 1.553986668586731,
526
- "logps/chosen": -460.4817810058594,
527
- "logps/rejected": -495.1622009277344,
528
- "loss": 0.8771,
529
- "rewards/accuracies": 0.699999988079071,
530
- "rewards/chosen": -1.8090622425079346,
531
- "rewards/margins": 0.8104559779167175,
532
- "rewards/rejected": -2.619518280029297,
533
  "step": 330
534
  },
535
  {
536
- "epoch": 0.36,
537
- "learning_rate": 4.0689152079869306e-07,
538
- "logits/chosen": 0.858076274394989,
539
- "logits/rejected": 1.5555229187011719,
540
- "logps/chosen": -441.11328125,
541
- "logps/rejected": -489.79693603515625,
542
- "loss": 0.8968,
543
- "rewards/accuracies": 0.65625,
544
- "rewards/chosen": -1.953595757484436,
545
- "rewards/margins": 0.7021313905715942,
546
- "rewards/rejected": -2.655726909637451,
547
  "step": 340
548
  },
549
  {
550
- "epoch": 0.37,
551
- "learning_rate": 3.99669658015821e-07,
552
- "logits/chosen": 0.8753985166549683,
553
- "logits/rejected": 1.0402195453643799,
554
- "logps/chosen": -438.79534912109375,
555
- "logps/rejected": -550.7193603515625,
556
- "loss": 0.8261,
557
- "rewards/accuracies": 0.7124999761581421,
558
- "rewards/chosen": -1.6666584014892578,
559
- "rewards/margins": 1.0503871440887451,
560
- "rewards/rejected": -2.717045307159424,
561
  "step": 350
562
  },
563
  {
564
- "epoch": 0.38,
565
- "learning_rate": 3.92247625331392e-07,
566
- "logits/chosen": 0.5132797360420227,
567
- "logits/rejected": 1.0654723644256592,
568
- "logps/chosen": -414.5316467285156,
569
- "logps/rejected": -473.7266540527344,
570
- "loss": 0.8232,
571
- "rewards/accuracies": 0.7250000238418579,
572
- "rewards/chosen": -1.317946195602417,
573
- "rewards/margins": 0.9921468496322632,
574
- "rewards/rejected": -2.3100931644439697,
575
  "step": 360
576
  },
577
  {
578
- "epoch": 0.39,
579
- "learning_rate": 3.846353490562664e-07,
580
- "logits/chosen": 0.6082018613815308,
581
- "logits/rejected": 1.0670816898345947,
582
- "logps/chosen": -366.9786071777344,
583
- "logps/rejected": -486.1373596191406,
584
- "loss": 0.7788,
585
- "rewards/accuracies": 0.78125,
586
- "rewards/chosen": -1.2683700323104858,
587
- "rewards/margins": 1.0532209873199463,
588
- "rewards/rejected": -2.3215911388397217,
589
  "step": 370
590
  },
591
  {
592
- "epoch": 0.4,
593
- "learning_rate": 3.768430099352445e-07,
594
- "logits/chosen": 0.43745535612106323,
595
- "logits/rejected": 1.5503931045532227,
596
- "logps/chosen": -478.939208984375,
597
- "logps/rejected": -556.5857543945312,
598
- "loss": 0.7987,
599
- "rewards/accuracies": 0.762499988079071,
600
- "rewards/chosen": -1.8233133554458618,
601
- "rewards/margins": 1.1179938316345215,
602
- "rewards/rejected": -2.9413070678710938,
603
  "step": 380
604
  },
605
  {
606
- "epoch": 0.41,
607
- "learning_rate": 3.6888102953122304e-07,
608
- "logits/chosen": 1.4241774082183838,
609
- "logits/rejected": 2.120664596557617,
610
- "logps/chosen": -464.5999450683594,
611
- "logps/rejected": -538.8297729492188,
612
- "loss": 0.7897,
613
- "rewards/accuracies": 0.731249988079071,
614
- "rewards/chosen": -1.8883854150772095,
615
- "rewards/margins": 1.1485675573349,
616
- "rewards/rejected": -3.0369527339935303,
617
  "step": 390
618
  },
619
  {
620
- "epoch": 0.42,
621
- "learning_rate": 3.607600562872785e-07,
622
- "logits/chosen": 1.6316184997558594,
623
- "logits/rejected": 2.457326650619507,
624
- "logps/chosen": -501.25262451171875,
625
- "logps/rejected": -549.1324462890625,
626
- "loss": 0.7845,
627
- "rewards/accuracies": 0.706250011920929,
628
- "rewards/chosen": -1.9686987400054932,
629
- "rewards/margins": 0.8676248788833618,
630
- "rewards/rejected": -2.8363232612609863,
631
  "step": 400
632
  },
633
  {
634
- "epoch": 0.42,
635
- "eval_logits/chosen": 2.1264495849609375,
636
- "eval_logits/rejected": 2.8754770755767822,
637
- "eval_logps/chosen": -461.9886779785156,
638
- "eval_logps/rejected": -553.6510009765625,
639
- "eval_loss": 0.8208896517753601,
640
- "eval_rewards/accuracies": 0.7519841194152832,
641
- "eval_rewards/chosen": -1.7787890434265137,
642
- "eval_rewards/margins": 1.1399353742599487,
643
- "eval_rewards/rejected": -2.91872501373291,
644
- "eval_runtime": 243.2973,
645
- "eval_samples_per_second": 8.22,
646
- "eval_steps_per_second": 0.259,
647
  "step": 400
648
  },
649
  {
650
- "epoch": 0.43,
651
- "learning_rate": 3.5249095128531856e-07,
652
- "logits/chosen": 1.401972770690918,
653
- "logits/rejected": 2.366103410720825,
654
- "logps/chosen": -486.5357971191406,
655
- "logps/rejected": -573.39013671875,
656
- "loss": 0.8379,
657
- "rewards/accuracies": 0.706250011920929,
658
- "rewards/chosen": -1.7267777919769287,
659
- "rewards/margins": 1.0969970226287842,
660
- "rewards/rejected": -2.823775053024292,
661
  "step": 410
662
  },
663
  {
664
- "epoch": 0.44,
665
- "learning_rate": 3.4408477372034736e-07,
666
- "logits/chosen": 1.4769244194030762,
667
- "logits/rejected": 2.5726444721221924,
668
- "logps/chosen": -433.69610595703125,
669
- "logps/rejected": -487.23077392578125,
670
- "loss": 0.823,
671
- "rewards/accuracies": 0.6812499761581421,
672
- "rewards/chosen": -1.6958898305892944,
673
- "rewards/margins": 0.8770249485969543,
674
- "rewards/rejected": -2.5729150772094727,
675
  "step": 420
676
  },
677
  {
678
- "epoch": 0.45,
679
- "learning_rate": 3.3555276610977276e-07,
680
- "logits/chosen": 2.185957431793213,
681
- "logits/rejected": 3.1616008281707764,
682
- "logps/chosen": -451.7828063964844,
683
- "logps/rejected": -523.4830932617188,
684
- "loss": 0.8662,
685
- "rewards/accuracies": 0.71875,
686
- "rewards/chosen": -1.8958429098129272,
687
- "rewards/margins": 0.9449018239974976,
688
- "rewards/rejected": -2.840744733810425,
689
  "step": 430
690
  },
691
  {
692
- "epoch": 0.46,
693
- "learning_rate": 3.269063392575352e-07,
694
- "logits/chosen": 2.3363919258117676,
695
- "logits/rejected": 2.1066231727600098,
696
- "logps/chosen": -447.093994140625,
697
- "logps/rejected": -537.6717529296875,
698
- "loss": 0.8362,
699
- "rewards/accuracies": 0.7124999761581421,
700
- "rewards/chosen": -1.8810195922851562,
701
- "rewards/margins": 1.006998896598816,
702
- "rewards/rejected": -2.8880181312561035,
703
  "step": 440
704
  },
705
  {
706
- "epoch": 0.47,
707
- "learning_rate": 3.1815705699316964e-07,
708
- "logits/chosen": 1.551466464996338,
709
- "logits/rejected": 1.8984363079071045,
710
- "logps/chosen": -445.85894775390625,
711
- "logps/rejected": -539.7463989257812,
712
- "loss": 0.8323,
713
- "rewards/accuracies": 0.737500011920929,
714
- "rewards/chosen": -1.817039132118225,
715
- "rewards/margins": 1.14361572265625,
716
- "rewards/rejected": -2.9606547355651855,
717
  "step": 450
718
  },
719
  {
720
- "epoch": 0.48,
721
- "learning_rate": 3.0931662070620794e-07,
722
- "logits/chosen": 0.859958827495575,
723
- "logits/rejected": 1.7097549438476562,
724
- "logps/chosen": -435.4864807128906,
725
- "logps/rejected": -531.817138671875,
726
- "loss": 0.8005,
727
- "rewards/accuracies": 0.731249988079071,
728
- "rewards/chosen": -1.7045366764068604,
729
- "rewards/margins": 1.0461620092391968,
730
- "rewards/rejected": -2.7506985664367676,
731
  "step": 460
732
  },
733
  {
734
- "epoch": 0.49,
735
- "learning_rate": 3.003968536966078e-07,
736
- "logits/chosen": 1.0430405139923096,
737
- "logits/rejected": 1.8049592971801758,
738
- "logps/chosen": -466.21612548828125,
739
- "logps/rejected": -558.8511962890625,
740
- "loss": 0.7768,
741
- "rewards/accuracies": 0.8062499761581421,
742
- "rewards/chosen": -1.6801469326019287,
743
- "rewards/margins": 1.3035141229629517,
744
- "rewards/rejected": -2.983660936355591,
745
  "step": 470
746
  },
747
  {
748
- "epoch": 0.5,
749
- "learning_rate": 2.9140968536213693e-07,
750
- "logits/chosen": 1.5663492679595947,
751
- "logits/rejected": 2.4222500324249268,
752
- "logps/chosen": -411.6764221191406,
753
- "logps/rejected": -506.650390625,
754
- "loss": 0.8345,
755
- "rewards/accuracies": 0.7562500238418579,
756
- "rewards/chosen": -1.7893803119659424,
757
- "rewards/margins": 0.9885958433151245,
758
- "rewards/rejected": -2.7779765129089355,
759
  "step": 480
760
  },
761
  {
762
- "epoch": 0.51,
763
- "learning_rate": 2.823671352438608e-07,
764
- "logits/chosen": 1.7138950824737549,
765
- "logits/rejected": 2.425443172454834,
766
- "logps/chosen": -452.81439208984375,
767
- "logps/rejected": -527.6519165039062,
768
- "loss": 0.79,
769
- "rewards/accuracies": 0.699999988079071,
770
- "rewards/chosen": -1.7711286544799805,
771
- "rewards/margins": 1.0116374492645264,
772
- "rewards/rejected": -2.782766103744507,
773
  "step": 490
774
  },
775
  {
776
- "epoch": 0.52,
777
- "learning_rate": 2.73281296951072e-07,
778
- "logits/chosen": 1.9489076137542725,
779
- "logits/rejected": 2.5219855308532715,
780
- "logps/chosen": -453.45367431640625,
781
- "logps/rejected": -559.2427978515625,
782
- "loss": 0.8323,
783
- "rewards/accuracies": 0.7437499761581421,
784
- "rewards/chosen": -1.8187179565429688,
785
- "rewards/margins": 1.278545618057251,
786
- "rewards/rejected": -3.0972630977630615,
787
- "step": 500
788
- },
789
- {
790
- "epoch": 0.52,
791
- "eval_logits/chosen": 1.3306254148483276,
792
- "eval_logits/rejected": 2.1307854652404785,
793
- "eval_logps/chosen": -427.62841796875,
794
- "eval_logps/rejected": -516.395263671875,
795
- "eval_loss": 0.8331887722015381,
796
- "eval_rewards/accuracies": 0.7440476417541504,
797
- "eval_rewards/chosen": -1.4351868629455566,
798
- "eval_rewards/margins": 1.1109802722930908,
799
- "eval_rewards/rejected": -2.5461671352386475,
800
- "eval_runtime": 243.742,
801
- "eval_samples_per_second": 8.205,
802
- "eval_steps_per_second": 0.258,
803
  "step": 500
804
  },
805
- {
806
- "epoch": 0.53,
807
- "learning_rate": 2.641643219871597e-07,
808
- "logits/chosen": 1.4478816986083984,
809
- "logits/rejected": 2.14026141166687,
810
- "logps/chosen": -437.3523864746094,
811
- "logps/rejected": -497.67791748046875,
812
- "loss": 0.778,
813
- "rewards/accuracies": 0.737500011920929,
814
- "rewards/chosen": -1.3602588176727295,
815
- "rewards/margins": 1.100663423538208,
816
- "rewards/rejected": -2.4609227180480957,
817
- "step": 510
818
- },
819
- {
820
- "epoch": 0.54,
821
- "learning_rate": 2.550284034980507e-07,
822
- "logits/chosen": 1.3803322315216064,
823
- "logits/rejected": 2.3976683616638184,
824
- "logps/chosen": -412.71685791015625,
825
- "logps/rejected": -522.4058837890625,
826
- "loss": 0.797,
827
- "rewards/accuracies": 0.7562500238418579,
828
- "rewards/chosen": -1.535652756690979,
829
- "rewards/margins": 1.171526312828064,
830
- "rewards/rejected": -2.707179069519043,
831
- "step": 520
832
- },
833
- {
834
- "epoch": 0.55,
835
- "learning_rate": 2.4588575996495794e-07,
836
- "logits/chosen": 2.3339693546295166,
837
- "logits/rejected": 3.0342774391174316,
838
- "logps/chosen": -474.455078125,
839
- "logps/rejected": -575.391357421875,
840
- "loss": 0.7824,
841
- "rewards/accuracies": 0.71875,
842
- "rewards/chosen": -1.9030723571777344,
843
- "rewards/margins": 1.281085729598999,
844
- "rewards/rejected": -3.1841578483581543,
845
- "step": 530
846
- },
847
- {
848
- "epoch": 0.57,
849
- "learning_rate": 2.367486188632446e-07,
850
- "logits/chosen": 2.5718750953674316,
851
- "logits/rejected": 3.330522060394287,
852
- "logps/chosen": -513.03955078125,
853
- "logps/rejected": -665.7197875976562,
854
- "loss": 0.8011,
855
- "rewards/accuracies": 0.762499988079071,
856
- "rewards/chosen": -2.188791036605835,
857
- "rewards/margins": 1.4658076763153076,
858
- "rewards/rejected": -3.6545987129211426,
859
- "step": 540
860
- },
861
- {
862
- "epoch": 0.58,
863
- "learning_rate": 2.276292003092593e-07,
864
- "logits/chosen": 3.1903679370880127,
865
- "logits/rejected": 4.050724983215332,
866
- "logps/chosen": -495.56915283203125,
867
- "logps/rejected": -606.86376953125,
868
- "loss": 0.8026,
869
- "rewards/accuracies": 0.768750011920929,
870
- "rewards/chosen": -2.2584142684936523,
871
- "rewards/margins": 1.3304917812347412,
872
- "rewards/rejected": -3.5889060497283936,
873
- "step": 550
874
- },
875
- {
876
- "epoch": 0.59,
877
- "learning_rate": 2.185397007170141e-07,
878
- "logits/chosen": 3.2466537952423096,
879
- "logits/rejected": 3.8034489154815674,
880
- "logps/chosen": -482.8548278808594,
881
- "logps/rejected": -559.918701171875,
882
- "loss": 0.8114,
883
- "rewards/accuracies": 0.7250000238418579,
884
- "rewards/chosen": -2.160890817642212,
885
- "rewards/margins": 1.1038119792938232,
886
- "rewards/rejected": -3.264702558517456,
887
- "step": 560
888
- },
889
- {
890
- "epoch": 0.6,
891
- "learning_rate": 2.094922764865619e-07,
892
- "logits/chosen": 2.1195971965789795,
893
- "logits/rejected": 3.2914786338806152,
894
- "logps/chosen": -474.29058837890625,
895
- "logps/rejected": -559.2501831054688,
896
- "loss": 0.8237,
897
- "rewards/accuracies": 0.7124999761581421,
898
- "rewards/chosen": -1.981508493423462,
899
- "rewards/margins": 1.0219013690948486,
900
- "rewards/rejected": -3.0034098625183105,
901
- "step": 570
902
- },
903
- {
904
- "epoch": 0.61,
905
- "learning_rate": 2.0049902774588797e-07,
906
- "logits/chosen": 2.163456916809082,
907
- "logits/rejected": 3.159027576446533,
908
- "logps/chosen": -483.96038818359375,
909
- "logps/rejected": -568.642578125,
910
- "loss": 0.817,
911
- "rewards/accuracies": 0.71875,
912
- "rewards/chosen": -2.100404977798462,
913
- "rewards/margins": 1.2073109149932861,
914
- "rewards/rejected": -3.307715654373169,
915
- "step": 580
916
- },
917
- {
918
- "epoch": 0.62,
919
- "learning_rate": 1.9157198216806238e-07,
920
- "logits/chosen": 1.664136528968811,
921
- "logits/rejected": 2.9907596111297607,
922
- "logps/chosen": -458.36456298828125,
923
- "logps/rejected": -561.4249267578125,
924
- "loss": 0.7903,
925
- "rewards/accuracies": 0.71875,
926
- "rewards/chosen": -1.8531945943832397,
927
- "rewards/margins": 0.9679274559020996,
928
- "rewards/rejected": -2.8211216926574707,
929
- "step": 590
930
- },
931
- {
932
- "epoch": 0.63,
933
- "learning_rate": 1.8272307888529274e-07,
934
- "logits/chosen": 1.441373348236084,
935
- "logits/rejected": 3.0396907329559326,
936
- "logps/chosen": -518.3441772460938,
937
- "logps/rejected": -635.1202392578125,
938
- "loss": 0.7677,
939
- "rewards/accuracies": 0.7437499761581421,
940
- "rewards/chosen": -1.9236825704574585,
941
- "rewards/margins": 1.306647777557373,
942
- "rewards/rejected": -3.230330228805542,
943
- "step": 600
944
- },
945
- {
946
- "epoch": 0.63,
947
- "eval_logits/chosen": 1.8965519666671753,
948
- "eval_logits/rejected": 3.3207030296325684,
949
- "eval_logps/chosen": -503.2609558105469,
950
- "eval_logps/rejected": -616.7921142578125,
951
- "eval_loss": 0.7981351613998413,
952
- "eval_rewards/accuracies": 0.7519841194152832,
953
- "eval_rewards/chosen": -2.191512107849121,
954
- "eval_rewards/margins": 1.358623743057251,
955
- "eval_rewards/rejected": -3.550135612487793,
956
- "eval_runtime": 243.1992,
957
- "eval_samples_per_second": 8.224,
958
- "eval_steps_per_second": 0.259,
959
- "step": 600
960
- },
961
- {
962
- "epoch": 0.64,
963
- "learning_rate": 1.7396415252139288e-07,
964
- "logits/chosen": 2.0666539669036865,
965
- "logits/rejected": 3.6420180797576904,
966
- "logps/chosen": -494.7054748535156,
967
- "logps/rejected": -573.6849975585938,
968
- "loss": 0.7714,
969
- "rewards/accuracies": 0.75,
970
- "rewards/chosen": -2.1968560218811035,
971
- "rewards/margins": 1.295644760131836,
972
- "rewards/rejected": -3.4925007820129395,
973
- "step": 610
974
- },
975
- {
976
- "epoch": 0.65,
977
- "learning_rate": 1.6530691736402316e-07,
978
- "logits/chosen": 2.116316795349121,
979
- "logits/rejected": 3.3095905780792236,
980
- "logps/chosen": -510.40380859375,
981
- "logps/rejected": -580.3692016601562,
982
- "loss": 0.7844,
983
- "rewards/accuracies": 0.737500011920929,
984
- "rewards/chosen": -2.3837578296661377,
985
- "rewards/margins": 1.1186126470565796,
986
- "rewards/rejected": -3.5023703575134277,
987
- "step": 620
988
- },
989
- {
990
- "epoch": 0.66,
991
- "learning_rate": 1.5676295169786864e-07,
992
- "logits/chosen": 2.523867130279541,
993
- "logits/rejected": 3.438098192214966,
994
- "logps/chosen": -483.1630859375,
995
- "logps/rejected": -625.9393310546875,
996
- "loss": 0.7686,
997
- "rewards/accuracies": 0.7562500238418579,
998
- "rewards/chosen": -2.2157483100891113,
999
- "rewards/margins": 1.489645004272461,
1000
- "rewards/rejected": -3.7053933143615723,
1001
- "step": 630
1002
- },
1003
- {
1004
- "epoch": 0.67,
1005
- "learning_rate": 1.483436823197092e-07,
1006
- "logits/chosen": 1.7444801330566406,
1007
- "logits/rejected": 2.5841469764709473,
1008
- "logps/chosen": -479.9925842285156,
1009
- "logps/rejected": -581.1832885742188,
1010
- "loss": 0.7969,
1011
- "rewards/accuracies": 0.78125,
1012
- "rewards/chosen": -1.9535319805145264,
1013
- "rewards/margins": 1.2860063314437866,
1014
- "rewards/rejected": -3.2395381927490234,
1015
- "step": 640
1016
- },
1017
- {
1018
- "epoch": 0.68,
1019
- "learning_rate": 1.4006036925609243e-07,
1020
- "logits/chosen": 1.0596259832382202,
1021
- "logits/rejected": 2.4878382682800293,
1022
- "logps/chosen": -521.443115234375,
1023
- "logps/rejected": -584.9697875976562,
1024
- "loss": 0.7884,
1025
- "rewards/accuracies": 0.75,
1026
- "rewards/chosen": -2.1512815952301025,
1027
- "rewards/margins": 1.0702219009399414,
1028
- "rewards/rejected": -3.221503734588623,
1029
- "step": 650
1030
- },
1031
- {
1032
- "epoch": 0.69,
1033
- "learning_rate": 1.319240907040458e-07,
1034
- "logits/chosen": 1.1169414520263672,
1035
- "logits/rejected": 1.6374781131744385,
1036
- "logps/chosen": -494.7315368652344,
1037
- "logps/rejected": -578.5076904296875,
1038
- "loss": 0.7997,
1039
- "rewards/accuracies": 0.7562500238418579,
1040
- "rewards/chosen": -1.9317266941070557,
1041
- "rewards/margins": 1.1828477382659912,
1042
- "rewards/rejected": -3.1145741939544678,
1043
- "step": 660
1044
- },
1045
- {
1046
- "epoch": 0.7,
1047
- "learning_rate": 1.239457282149695e-07,
1048
- "logits/chosen": 1.0661325454711914,
1049
- "logits/rejected": 2.232234477996826,
1050
- "logps/chosen": -471.725341796875,
1051
- "logps/rejected": -577.9931640625,
1052
- "loss": 0.791,
1053
- "rewards/accuracies": 0.75,
1054
- "rewards/chosen": -1.8857122659683228,
1055
- "rewards/margins": 1.2069826126098633,
1056
- "rewards/rejected": -3.0926949977874756,
1057
- "step": 670
1058
- },
1059
- {
1060
- "epoch": 0.71,
1061
- "learning_rate": 1.1613595214152711e-07,
1062
- "logits/chosen": 1.3097771406173706,
1063
- "logits/rejected": 1.997799277305603,
1064
- "logps/chosen": -444.1643981933594,
1065
- "logps/rejected": -502.30548095703125,
1066
- "loss": 0.8287,
1067
- "rewards/accuracies": 0.6499999761581421,
1068
- "rewards/chosen": -1.826926827430725,
1069
- "rewards/margins": 0.8142975568771362,
1070
- "rewards/rejected": -2.6412243843078613,
1071
- "step": 680
1072
- },
1073
- {
1074
- "epoch": 0.72,
1075
- "learning_rate": 1.0850520736699362e-07,
1076
- "logits/chosen": 0.9592529535293579,
1077
- "logits/rejected": 2.325680732727051,
1078
- "logps/chosen": -430.89532470703125,
1079
- "logps/rejected": -537.284423828125,
1080
- "loss": 0.7625,
1081
- "rewards/accuracies": 0.7562500238418579,
1082
- "rewards/chosen": -1.6944091320037842,
1083
- "rewards/margins": 1.2595393657684326,
1084
- "rewards/rejected": -2.953948497772217,
1085
- "step": 690
1086
- },
1087
- {
1088
- "epoch": 0.73,
1089
- "learning_rate": 1.0106369933615042e-07,
1090
- "logits/chosen": 1.8058967590332031,
1091
- "logits/rejected": 3.336688280105591,
1092
- "logps/chosen": -486.33380126953125,
1093
- "logps/rejected": -616.1010131835938,
1094
- "loss": 0.7227,
1095
- "rewards/accuracies": 0.762499988079071,
1096
- "rewards/chosen": -2.3213586807250977,
1097
- "rewards/margins": 1.3918098211288452,
1098
- "rewards/rejected": -3.7131690979003906,
1099
- "step": 700
1100
- },
1101
- {
1102
- "epoch": 0.73,
1103
- "eval_logits/chosen": 1.7450497150421143,
1104
- "eval_logits/rejected": 2.97623610496521,
1105
- "eval_logps/chosen": -507.26715087890625,
1106
- "eval_logps/rejected": -623.6929321289062,
1107
- "eval_loss": 0.7834469079971313,
1108
- "eval_rewards/accuracies": 0.7638888955116272,
1109
- "eval_rewards/chosen": -2.2315735816955566,
1110
- "eval_rewards/margins": 1.3875702619552612,
1111
- "eval_rewards/rejected": -3.6191442012786865,
1112
- "eval_runtime": 244.2063,
1113
- "eval_samples_per_second": 8.19,
1114
- "eval_steps_per_second": 0.258,
1115
- "step": 700
1116
- },
1117
- {
1118
- "epoch": 0.74,
1119
- "learning_rate": 9.382138040640714e-08,
1120
- "logits/chosen": 1.7643959522247314,
1121
- "logits/rejected": 3.0291295051574707,
1122
- "logps/chosen": -539.8101806640625,
1123
- "logps/rejected": -597.0071411132812,
1124
- "loss": 0.7477,
1125
- "rewards/accuracies": 0.6937500238418579,
1126
- "rewards/chosen": -2.3488943576812744,
1127
- "rewards/margins": 1.2158434391021729,
1128
- "rewards/rejected": -3.5647377967834473,
1129
- "step": 710
1130
- },
1131
- {
1132
- "epoch": 0.75,
1133
- "learning_rate": 8.678793653740632e-08,
1134
- "logits/chosen": 2.335305690765381,
1135
- "logits/rejected": 3.246683120727539,
1136
- "logps/chosen": -533.5028076171875,
1137
- "logps/rejected": -652.6378173828125,
1138
- "loss": 0.759,
1139
- "rewards/accuracies": 0.7437499761581421,
1140
- "rewards/chosen": -2.420844554901123,
1141
- "rewards/margins": 1.4402214288711548,
1142
- "rewards/rejected": -3.8610661029815674,
1143
- "step": 720
1144
- },
1145
- {
1146
- "epoch": 0.76,
1147
- "learning_rate": 7.997277433690983e-08,
1148
- "logits/chosen": 2.274118661880493,
1149
- "logits/rejected": 3.46764874458313,
1150
- "logps/chosen": -541.6676025390625,
1151
- "logps/rejected": -625.87744140625,
1152
- "loss": 0.7646,
1153
- "rewards/accuracies": 0.762499988079071,
1154
- "rewards/chosen": -2.377727746963501,
1155
- "rewards/margins": 1.3614501953125,
1156
- "rewards/rejected": -3.73917818069458,
1157
- "step": 730
1158
- },
1159
- {
1160
- "epoch": 0.77,
1161
- "learning_rate": 7.338500848029602e-08,
1162
- "logits/chosen": 2.568730592727661,
1163
- "logits/rejected": 3.9025726318359375,
1164
- "logps/chosen": -562.5526123046875,
1165
- "logps/rejected": -680.5411376953125,
1166
- "loss": 0.7825,
1167
- "rewards/accuracies": 0.824999988079071,
1168
- "rewards/chosen": -2.482536792755127,
1169
- "rewards/margins": 1.7319949865341187,
1170
- "rewards/rejected": -4.214531898498535,
1171
- "step": 740
1172
- },
1173
- {
1174
- "epoch": 0.78,
1175
- "learning_rate": 6.70334495204884e-08,
1176
- "logits/chosen": 2.1631178855895996,
1177
- "logits/rejected": 3.1983213424682617,
1178
- "logps/chosen": -532.3298950195312,
1179
- "logps/rejected": -670.1683349609375,
1180
- "loss": 0.7519,
1181
- "rewards/accuracies": 0.737500011920929,
1182
- "rewards/chosen": -2.4897987842559814,
1183
- "rewards/margins": 1.4266437292099,
1184
- "rewards/rejected": -3.916442394256592,
1185
- "step": 750
1186
- },
1187
- {
1188
- "epoch": 0.8,
1189
- "learning_rate": 6.092659210462231e-08,
1190
- "logits/chosen": 2.325679302215576,
1191
- "logits/rejected": 2.9977734088897705,
1192
- "logps/chosen": -529.2430419921875,
1193
- "logps/rejected": -633.878173828125,
1194
- "loss": 0.7115,
1195
- "rewards/accuracies": 0.762499988079071,
1196
- "rewards/chosen": -2.5113611221313477,
1197
- "rewards/margins": 1.3206489086151123,
1198
- "rewards/rejected": -3.832010269165039,
1199
- "step": 760
1200
- },
1201
- {
1202
- "epoch": 0.81,
1203
- "learning_rate": 5.507260361320737e-08,
1204
- "logits/chosen": 2.3807642459869385,
1205
- "logits/rejected": 3.3292059898376465,
1206
- "logps/chosen": -568.1010131835938,
1207
- "logps/rejected": -696.64453125,
1208
- "loss": 0.7886,
1209
- "rewards/accuracies": 0.7250000238418579,
1210
- "rewards/chosen": -2.7410290241241455,
1211
- "rewards/margins": 1.2305468320846558,
1212
- "rewards/rejected": -3.971575975418091,
1213
- "step": 770
1214
- },
1215
- {
1216
- "epoch": 0.82,
1217
- "learning_rate": 4.947931323697982e-08,
1218
- "logits/chosen": 2.637059211730957,
1219
- "logits/rejected": 3.690800905227661,
1220
- "logps/chosen": -514.2804565429688,
1221
- "logps/rejected": -613.6573486328125,
1222
- "loss": 0.7885,
1223
- "rewards/accuracies": 0.71875,
1224
- "rewards/chosen": -2.580803394317627,
1225
- "rewards/margins": 1.19155752658844,
1226
- "rewards/rejected": -3.7723612785339355,
1227
- "step": 780
1228
- },
1229
- {
1230
- "epoch": 0.83,
1231
- "learning_rate": 4.415420150605398e-08,
1232
- "logits/chosen": 2.267085313796997,
1233
- "logits/rejected": 3.2141170501708984,
1234
- "logps/chosen": -574.6113891601562,
1235
- "logps/rejected": -717.8250122070312,
1236
- "loss": 0.7567,
1237
- "rewards/accuracies": 0.793749988079071,
1238
- "rewards/chosen": -2.7426769733428955,
1239
- "rewards/margins": 1.5901119709014893,
1240
- "rewards/rejected": -4.332788467407227,
1241
- "step": 790
1242
- },
1243
- {
1244
- "epoch": 0.84,
1245
- "learning_rate": 3.9104390285376374e-08,
1246
- "logits/chosen": 1.8068571090698242,
1247
- "logits/rejected": 3.51857328414917,
1248
- "logps/chosen": -589.93505859375,
1249
- "logps/rejected": -680.4630126953125,
1250
- "loss": 0.7455,
1251
- "rewards/accuracies": 0.768750011920929,
1252
- "rewards/chosen": -2.487300395965576,
1253
- "rewards/margins": 1.4988832473754883,
1254
- "rewards/rejected": -3.9861836433410645,
1255
- "step": 800
1256
- },
1257
- {
1258
- "epoch": 0.84,
1259
- "eval_logits/chosen": 1.9905518293380737,
1260
- "eval_logits/rejected": 3.243948459625244,
1261
- "eval_logps/chosen": -526.2843627929688,
1262
- "eval_logps/rejected": -643.9625854492188,
1263
- "eval_loss": 0.7792153358459473,
1264
- "eval_rewards/accuracies": 0.7757936716079712,
1265
- "eval_rewards/chosen": -2.421745777130127,
1266
- "eval_rewards/margins": 1.4000948667526245,
1267
- "eval_rewards/rejected": -3.821840286254883,
1268
- "eval_runtime": 242.9784,
1269
- "eval_samples_per_second": 8.231,
1270
- "eval_steps_per_second": 0.259,
1271
- "step": 800
1272
- },
1273
- {
1274
- "epoch": 0.85,
1275
- "learning_rate": 3.433663324986208e-08,
1276
- "logits/chosen": 2.3890461921691895,
1277
- "logits/rejected": 3.671307325363159,
1278
- "logps/chosen": -535.8336791992188,
1279
- "logps/rejected": -607.342041015625,
1280
- "loss": 0.7698,
1281
- "rewards/accuracies": 0.75,
1282
- "rewards/chosen": -2.5636343955993652,
1283
- "rewards/margins": 1.1194355487823486,
1284
- "rewards/rejected": -3.683069944381714,
1285
- "step": 810
1286
- },
1287
- {
1288
- "epoch": 0.86,
1289
- "learning_rate": 2.9857306851953897e-08,
1290
- "logits/chosen": 2.8772475719451904,
1291
- "logits/rejected": 3.245588779449463,
1292
- "logps/chosen": -489.6429748535156,
1293
- "logps/rejected": -598.0228271484375,
1294
- "loss": 0.8105,
1295
- "rewards/accuracies": 0.75,
1296
- "rewards/chosen": -2.369723081588745,
1297
- "rewards/margins": 1.2586920261383057,
1298
- "rewards/rejected": -3.6284148693084717,
1299
- "step": 820
1300
- },
1301
- {
1302
- "epoch": 0.87,
1303
- "learning_rate": 2.567240179368185e-08,
1304
- "logits/chosen": 1.9207391738891602,
1305
- "logits/rejected": 3.2350220680236816,
1306
- "logps/chosen": -485.8853454589844,
1307
- "logps/rejected": -604.6241455078125,
1308
- "loss": 0.7663,
1309
- "rewards/accuracies": 0.768750011920929,
1310
- "rewards/chosen": -2.3702356815338135,
1311
- "rewards/margins": 1.2538180351257324,
1312
- "rewards/rejected": -3.624053955078125,
1313
- "step": 830
1314
- },
1315
- {
1316
- "epoch": 0.88,
1317
- "learning_rate": 2.1787515014630357e-08,
1318
- "logits/chosen": 2.4551303386688232,
1319
- "logits/rejected": 2.7984118461608887,
1320
- "logps/chosen": -548.1451416015625,
1321
- "logps/rejected": -633.4719848632812,
1322
- "loss": 0.8024,
1323
- "rewards/accuracies": 0.6937500238418579,
1324
- "rewards/chosen": -2.350025177001953,
1325
- "rewards/margins": 1.1445175409317017,
1326
- "rewards/rejected": -3.494542360305786,
1327
- "step": 840
1328
- },
1329
- {
1330
- "epoch": 0.89,
1331
- "learning_rate": 1.820784220652766e-08,
1332
- "logits/chosen": 2.1438848972320557,
1333
- "logits/rejected": 3.26000714302063,
1334
- "logps/chosen": -524.2879638671875,
1335
- "logps/rejected": -579.0330200195312,
1336
- "loss": 0.781,
1337
- "rewards/accuracies": 0.75,
1338
- "rewards/chosen": -2.2401034832000732,
1339
- "rewards/margins": 1.1002050638198853,
1340
- "rewards/rejected": -3.340308666229248,
1341
- "step": 850
1342
- },
1343
- {
1344
- "epoch": 0.9,
1345
- "learning_rate": 1.4938170864468636e-08,
1346
- "logits/chosen": 2.354217290878296,
1347
- "logits/rejected": 3.4680027961730957,
1348
- "logps/chosen": -520.9027099609375,
1349
- "logps/rejected": -637.2523193359375,
1350
- "loss": 0.7833,
1351
- "rewards/accuracies": 0.768750011920929,
1352
- "rewards/chosen": -2.4247303009033203,
1353
- "rewards/margins": 1.4387235641479492,
1354
- "rewards/rejected": -3.8634536266326904,
1355
- "step": 860
1356
- },
1357
- {
1358
- "epoch": 0.91,
1359
- "learning_rate": 1.1982873884064465e-08,
1360
- "logits/chosen": 2.0364556312561035,
1361
- "logits/rejected": 3.0106232166290283,
1362
- "logps/chosen": -446.37164306640625,
1363
- "logps/rejected": -599.5112915039062,
1364
- "loss": 0.7706,
1365
- "rewards/accuracies": 0.8125,
1366
- "rewards/chosen": -1.993843674659729,
1367
- "rewards/margins": 1.5384770631790161,
1368
- "rewards/rejected": -3.532320499420166,
1369
- "step": 870
1370
- },
1371
- {
1372
- "epoch": 0.92,
1373
- "learning_rate": 9.345903713082304e-09,
1374
- "logits/chosen": 1.718162178993225,
1375
- "logits/rejected": 3.142746686935425,
1376
- "logps/chosen": -520.4351806640625,
1377
- "logps/rejected": -618.0506591796875,
1378
- "loss": 0.7503,
1379
- "rewards/accuracies": 0.737500011920929,
1380
- "rewards/chosen": -2.463242769241333,
1381
- "rewards/margins": 1.1839332580566406,
1382
- "rewards/rejected": -3.6471760272979736,
1383
- "step": 880
1384
- },
1385
- {
1386
- "epoch": 0.93,
1387
- "learning_rate": 7.030787065396865e-09,
1388
- "logits/chosen": 1.782679796218872,
1389
- "logits/rejected": 3.1341471672058105,
1390
- "logps/chosen": -507.89044189453125,
1391
- "logps/rejected": -622.839111328125,
1392
- "loss": 0.8135,
1393
- "rewards/accuracies": 0.731249988079071,
1394
- "rewards/chosen": -2.4141266345977783,
1395
- "rewards/margins": 1.1595393419265747,
1396
- "rewards/rejected": -3.5736660957336426,
1397
- "step": 890
1398
- },
1399
- {
1400
- "epoch": 0.94,
1401
- "learning_rate": 5.04062020432286e-09,
1402
- "logits/chosen": 2.032198190689087,
1403
- "logits/rejected": 3.270922899246216,
1404
- "logps/chosen": -533.2232055664062,
1405
- "logps/rejected": -638.6824340820312,
1406
- "loss": 0.7785,
1407
- "rewards/accuracies": 0.706250011920929,
1408
- "rewards/chosen": -2.348451852798462,
1409
- "rewards/margins": 1.1957995891571045,
1410
- "rewards/rejected": -3.5442516803741455,
1411
- "step": 900
1412
- },
1413
- {
1414
- "epoch": 0.94,
1415
- "eval_logits/chosen": 1.8721567392349243,
1416
- "eval_logits/rejected": 3.0941007137298584,
1417
- "eval_logps/chosen": -513.8541870117188,
1418
- "eval_logps/rejected": -628.2752685546875,
1419
- "eval_loss": 0.7778856158256531,
1420
- "eval_rewards/accuracies": 0.7797619104385376,
1421
- "eval_rewards/chosen": -2.2974438667297363,
1422
- "eval_rewards/margins": 1.3675230741500854,
1423
- "eval_rewards/rejected": -3.6649670600891113,
1424
- "eval_runtime": 244.1274,
1425
- "eval_samples_per_second": 8.192,
1426
- "eval_steps_per_second": 0.258,
1427
- "step": 900
1428
- },
1429
- {
1430
- "epoch": 0.95,
1431
- "learning_rate": 3.3780648016376866e-09,
1432
- "logits/chosen": 2.228888988494873,
1433
- "logits/rejected": 3.4416561126708984,
1434
- "logps/chosen": -489.8466796875,
1435
- "logps/rejected": -613.7962646484375,
1436
- "loss": 0.7601,
1437
- "rewards/accuracies": 0.800000011920929,
1438
- "rewards/chosen": -2.4230568408966064,
1439
- "rewards/margins": 1.299936056137085,
1440
- "rewards/rejected": -3.7229926586151123,
1441
- "step": 910
1442
- },
1443
- {
1444
- "epoch": 0.96,
1445
- "learning_rate": 2.0453443778310766e-09,
1446
- "logits/chosen": 2.0826706886291504,
1447
- "logits/rejected": 3.181570529937744,
1448
- "logps/chosen": -532.4862670898438,
1449
- "logps/rejected": -636.2462158203125,
1450
- "loss": 0.7257,
1451
- "rewards/accuracies": 0.8062499761581421,
1452
- "rewards/chosen": -2.3680665493011475,
1453
- "rewards/margins": 1.4134373664855957,
1454
- "rewards/rejected": -3.7815041542053223,
1455
- "step": 920
1456
- },
1457
- {
1458
- "epoch": 0.97,
1459
- "learning_rate": 1.0442413283435758e-09,
1460
- "logits/chosen": 1.8148505687713623,
1461
- "logits/rejected": 2.911557674407959,
1462
- "logps/chosen": -503.92071533203125,
1463
- "logps/rejected": -622.6376953125,
1464
- "loss": 0.7243,
1465
- "rewards/accuracies": 0.84375,
1466
- "rewards/chosen": -2.3115131855010986,
1467
- "rewards/margins": 1.4749082326889038,
1468
- "rewards/rejected": -3.786421537399292,
1469
- "step": 930
1470
- },
1471
  {
1472
  "epoch": 0.98,
1473
- "learning_rate": 3.760945397705828e-10,
1474
- "logits/chosen": 2.1273417472839355,
1475
- "logits/rejected": 3.256913423538208,
1476
- "logps/chosen": -574.5153198242188,
1477
- "logps/rejected": -671.7332763671875,
1478
- "loss": 0.7477,
1479
- "rewards/accuracies": 0.7749999761581421,
1480
- "rewards/chosen": -2.447880268096924,
1481
- "rewards/margins": 1.3559364080429077,
1482
- "rewards/rejected": -3.803816556930542,
1483
- "step": 940
1484
- },
1485
- {
1486
- "epoch": 0.99,
1487
- "learning_rate": 4.17975992204056e-11,
1488
- "logits/chosen": 1.6933950185775757,
1489
- "logits/rejected": 3.3468966484069824,
1490
- "logps/chosen": -524.0973510742188,
1491
- "logps/rejected": -652.6331787109375,
1492
- "loss": 0.8027,
1493
- "rewards/accuracies": 0.7562500238418579,
1494
- "rewards/chosen": -2.3957927227020264,
1495
- "rewards/margins": 1.4984779357910156,
1496
- "rewards/rejected": -3.894270420074463,
1497
- "step": 950
1498
  },
1499
  {
1500
  "epoch": 1.0,
1501
- "step": 955,
1502
  "total_flos": 0.0,
1503
- "train_loss": 0.8416462149295507,
1504
- "train_runtime": 20662.0179,
1505
- "train_samples_per_second": 2.959,
1506
- "train_steps_per_second": 0.046
1507
  }
1508
  ],
1509
  "logging_steps": 10,
1510
- "max_steps": 955,
1511
  "num_input_tokens_seen": 0,
1512
  "num_train_epochs": 1,
1513
- "save_steps": 100000000,
1514
  "total_flos": 0.0,
1515
- "train_batch_size": 8,
1516
  "trial_name": null,
1517
  "trial_params": null
1518
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9990186457311089,
5
  "eval_steps": 100,
6
+ "global_step": 509,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 9.803921568627451e-09,
14
+ "logits/chosen": -2.7483465671539307,
15
+ "logits/rejected": -2.739339828491211,
16
+ "logps/chosen": -287.5325927734375,
17
+ "logps/rejected": -235.635986328125,
18
+ "loss": 0.6931,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
 
23
  "step": 1
24
  },
25
  {
26
+ "epoch": 0.02,
27
+ "learning_rate": 9.80392156862745e-08,
28
+ "logits/chosen": -2.709578037261963,
29
+ "logits/rejected": -2.7113540172576904,
30
+ "logps/chosen": -260.56292724609375,
31
+ "logps/rejected": -256.438232421875,
32
+ "loss": 0.6932,
33
+ "rewards/accuracies": 0.4194444417953491,
34
+ "rewards/chosen": 0.00014394157915376127,
35
+ "rewards/margins": 1.0432106591906631e-06,
36
+ "rewards/rejected": 0.00014289839600678533,
37
  "step": 10
38
  },
39
  {
40
+ "epoch": 0.04,
41
+ "learning_rate": 1.96078431372549e-07,
42
+ "logits/chosen": -2.728665828704834,
43
+ "logits/rejected": -2.7061820030212402,
44
+ "logps/chosen": -280.0662536621094,
45
+ "logps/rejected": -254.76626586914062,
46
+ "loss": 0.6926,
47
+ "rewards/accuracies": 0.5724999904632568,
48
+ "rewards/chosen": -4.974007424607407e-06,
49
+ "rewards/margins": 0.0005589541979134083,
50
+ "rewards/rejected": -0.0005639282753691077,
51
  "step": 20
52
  },
53
  {
54
+ "epoch": 0.06,
55
+ "learning_rate": 2.941176470588235e-07,
56
+ "logits/chosen": -2.7290821075439453,
57
+ "logits/rejected": -2.742999315261841,
58
+ "logps/chosen": -279.2391357421875,
59
+ "logps/rejected": -253.37265014648438,
60
+ "loss": 0.6895,
61
+ "rewards/accuracies": 0.6349999904632568,
62
+ "rewards/chosen": 0.0049138437025249004,
63
+ "rewards/margins": 0.007674422115087509,
64
+ "rewards/rejected": -0.002760578179731965,
65
  "step": 30
66
  },
67
  {
68
+ "epoch": 0.08,
69
+ "learning_rate": 3.92156862745098e-07,
70
+ "logits/chosen": -2.7134017944335938,
71
+ "logits/rejected": -2.698641777038574,
72
+ "logps/chosen": -274.20147705078125,
73
+ "logps/rejected": -255.8253936767578,
74
+ "loss": 0.6782,
75
+ "rewards/accuracies": 0.6924999952316284,
76
+ "rewards/chosen": 0.0260241087526083,
77
+ "rewards/margins": 0.026919733732938766,
78
+ "rewards/rejected": -0.0008956241654232144,
79
  "step": 40
80
  },
81
  {
82
+ "epoch": 0.1,
83
+ "learning_rate": 4.901960784313725e-07,
84
+ "logits/chosen": -2.6435346603393555,
85
+ "logits/rejected": -2.6110424995422363,
86
+ "logps/chosen": -302.06768798828125,
87
+ "logps/rejected": -261.10919189453125,
88
+ "loss": 0.6612,
89
+ "rewards/accuracies": 0.7124999761581421,
90
+ "rewards/chosen": 0.023571131750941277,
91
+ "rewards/margins": 0.07649616152048111,
92
+ "rewards/rejected": -0.05292503535747528,
93
  "step": 50
94
  },
95
  {
96
+ "epoch": 0.12,
97
+ "learning_rate": 4.995237599803335e-07,
98
+ "logits/chosen": -2.6205055713653564,
99
+ "logits/rejected": -2.5843255519866943,
100
+ "logps/chosen": -300.914306640625,
101
+ "logps/rejected": -286.0216064453125,
102
+ "loss": 0.6451,
103
  "rewards/accuracies": 0.6625000238418579,
104
+ "rewards/chosen": -0.05583832785487175,
105
+ "rewards/margins": 0.11994686722755432,
106
+ "rewards/rejected": -0.17578519880771637,
107
  "step": 60
108
  },
109
  {
110
+ "epoch": 0.14,
111
+ "learning_rate": 4.978798275112142e-07,
112
+ "logits/chosen": -2.607668161392212,
113
+ "logits/rejected": -2.568187952041626,
114
+ "logps/chosen": -308.4685974121094,
115
+ "logps/rejected": -305.6259460449219,
116
+ "loss": 0.6212,
117
+ "rewards/accuracies": 0.675000011920929,
118
+ "rewards/chosen": -0.1777888685464859,
119
+ "rewards/margins": 0.19118839502334595,
120
+ "rewards/rejected": -0.3689771890640259,
121
  "step": 70
122
  },
123
  {
124
+ "epoch": 0.16,
125
+ "learning_rate": 4.950700530747689e-07,
126
+ "logits/chosen": -2.6067116260528564,
127
+ "logits/rejected": -2.5767879486083984,
128
+ "logps/chosen": -300.19488525390625,
129
+ "logps/rejected": -295.8065185546875,
130
+ "loss": 0.6196,
131
+ "rewards/accuracies": 0.6850000023841858,
132
+ "rewards/chosen": -0.13195012509822845,
133
+ "rewards/margins": 0.25833892822265625,
134
+ "rewards/rejected": -0.3902890384197235,
135
  "step": 80
136
  },
137
  {
138
+ "epoch": 0.18,
139
+ "learning_rate": 4.911076517558622e-07,
140
+ "logits/chosen": -2.5809831619262695,
141
+ "logits/rejected": -2.555103302001953,
142
+ "logps/chosen": -325.28692626953125,
143
+ "logps/rejected": -330.8323974609375,
144
+ "loss": 0.5844,
145
+ "rewards/accuracies": 0.7300000190734863,
146
+ "rewards/chosen": -0.21861158311367035,
147
+ "rewards/margins": 0.3220059275627136,
148
+ "rewards/rejected": -0.5406175851821899,
149
  "step": 90
150
  },
151
  {
152
+ "epoch": 0.2,
153
+ "learning_rate": 4.860112597371772e-07,
154
+ "logits/chosen": -2.5413742065429688,
155
+ "logits/rejected": -2.5363407135009766,
156
+ "logps/chosen": -295.8542175292969,
157
+ "logps/rejected": -310.6338195800781,
158
+ "loss": 0.5764,
159
+ "rewards/accuracies": 0.6675000190734863,
160
+ "rewards/chosen": -0.26630619168281555,
161
+ "rewards/margins": 0.3358945846557617,
162
+ "rewards/rejected": -0.6022006869316101,
163
  "step": 100
164
  },
165
  {
166
+ "epoch": 0.2,
167
+ "eval_logits/chosen": -2.4791219234466553,
168
+ "eval_logits/rejected": -2.4360005855560303,
169
+ "eval_logps/chosen": -313.6502990722656,
170
+ "eval_logps/rejected": -340.86053466796875,
171
+ "eval_loss": 0.5828901529312134,
172
+ "eval_rewards/accuracies": 0.6931137442588806,
173
+ "eval_rewards/chosen": -0.3592246174812317,
174
+ "eval_rewards/margins": 0.40203189849853516,
175
+ "eval_rewards/rejected": -0.7612565159797668,
176
+ "eval_runtime": 494.2516,
177
+ "eval_samples_per_second": 4.047,
178
+ "eval_steps_per_second": 0.338,
179
  "step": 100
180
  },
181
  {
182
+ "epoch": 0.22,
183
+ "learning_rate": 4.798048466485017e-07,
184
+ "logits/chosen": -2.0916123390197754,
185
+ "logits/rejected": -2.1291110515594482,
186
+ "logps/chosen": -337.0193786621094,
187
+ "logps/rejected": -372.4815368652344,
188
+ "loss": 0.5665,
189
+ "rewards/accuracies": 0.7124999761581421,
190
+ "rewards/chosen": -0.6119796633720398,
191
+ "rewards/margins": 0.5584384799003601,
192
+ "rewards/rejected": -1.1704181432724,
193
  "step": 110
194
  },
195
  {
196
+ "epoch": 0.24,
197
+ "learning_rate": 4.725176028314541e-07,
198
+ "logits/chosen": -1.8370585441589355,
199
+ "logits/rejected": -1.7712280750274658,
200
+ "logps/chosen": -370.1864318847656,
201
+ "logps/rejected": -398.8289794921875,
202
+ "loss": 0.56,
203
+ "rewards/accuracies": 0.7350000143051147,
204
+ "rewards/chosen": -0.8116917610168457,
205
+ "rewards/margins": 0.6380540728569031,
206
+ "rewards/rejected": -1.449745774269104,
207
  "step": 120
208
  },
209
  {
210
+ "epoch": 0.26,
211
+ "learning_rate": 4.641838020498713e-07,
212
+ "logits/chosen": -1.7485500574111938,
213
+ "logits/rejected": -1.5671393871307373,
214
+ "logps/chosen": -380.29913330078125,
215
+ "logps/rejected": -424.1035461425781,
216
+ "loss": 0.5461,
217
+ "rewards/accuracies": 0.7200000286102295,
218
+ "rewards/chosen": -0.8717474937438965,
219
+ "rewards/margins": 0.6444628834724426,
220
+ "rewards/rejected": -1.5162103176116943,
221
  "step": 130
222
  },
223
  {
224
+ "epoch": 0.27,
225
+ "learning_rate": 4.5484264029156733e-07,
226
+ "logits/chosen": -1.9667887687683105,
227
+ "logits/rejected": -1.6983026266098022,
228
+ "logps/chosen": -322.9972839355469,
229
+ "logps/rejected": -379.5963134765625,
230
+ "loss": 0.5416,
231
+ "rewards/accuracies": 0.7149999737739563,
232
+ "rewards/chosen": -0.6348860263824463,
233
+ "rewards/margins": 0.6040786504745483,
234
+ "rewards/rejected": -1.2389646768569946,
235
  "step": 140
236
  },
237
  {
238
+ "epoch": 0.29,
239
+ "learning_rate": 4.445380514196192e-07,
240
+ "logits/chosen": -1.2058897018432617,
241
+ "logits/rejected": -0.9969528317451477,
242
+ "logps/chosen": -379.3441467285156,
243
+ "logps/rejected": -449.9009704589844,
244
+ "loss": 0.5485,
245
+ "rewards/accuracies": 0.737500011920929,
246
+ "rewards/chosen": -0.9173200726509094,
247
+ "rewards/margins": 0.7758927941322327,
248
+ "rewards/rejected": -1.6932127475738525,
249
  "step": 150
250
  },
251
  {
252
+ "epoch": 0.31,
253
+ "learning_rate": 4.33318500540218e-07,
254
+ "logits/chosen": -1.7521625757217407,
255
+ "logits/rejected": -1.4877443313598633,
256
+ "logps/chosen": -356.1580810546875,
257
+ "logps/rejected": -389.0058288574219,
258
+ "loss": 0.5183,
259
+ "rewards/accuracies": 0.7850000262260437,
260
+ "rewards/chosen": -0.6841800212860107,
261
+ "rewards/margins": 0.7851129174232483,
262
+ "rewards/rejected": -1.4692928791046143,
263
  "step": 160
264
  },
265
  {
266
+ "epoch": 0.33,
267
+ "learning_rate": 4.2123675605892985e-07,
268
+ "logits/chosen": -1.6861900091171265,
269
+ "logits/rejected": -1.4684306383132935,
270
+ "logps/chosen": -379.7774658203125,
271
+ "logps/rejected": -437.3900451660156,
272
+ "loss": 0.5146,
273
+ "rewards/accuracies": 0.7300000190734863,
274
+ "rewards/chosen": -0.8159699440002441,
275
+ "rewards/margins": 0.7220683097839355,
276
+ "rewards/rejected": -1.5380383729934692,
277
  "step": 170
278
  },
279
  {
280
+ "epoch": 0.35,
281
+ "learning_rate": 4.0834964149744333e-07,
282
+ "logits/chosen": -1.3343206644058228,
283
+ "logits/rejected": -1.0179518461227417,
284
+ "logps/chosen": -358.3331298828125,
285
+ "logps/rejected": -399.9204406738281,
286
+ "loss": 0.5536,
287
+ "rewards/accuracies": 0.7074999809265137,
288
+ "rewards/chosen": -0.8257815837860107,
289
+ "rewards/margins": 0.7000215649604797,
290
+ "rewards/rejected": -1.5258032083511353,
291
  "step": 180
292
  },
293
  {
294
+ "epoch": 0.37,
295
+ "learning_rate": 3.947177682380738e-07,
296
+ "logits/chosen": -1.2010215520858765,
297
+ "logits/rejected": -0.8926857709884644,
298
+ "logps/chosen": -375.1010437011719,
299
+ "logps/rejected": -433.2417297363281,
300
+ "loss": 0.5309,
301
+ "rewards/accuracies": 0.7425000071525574,
302
+ "rewards/chosen": -0.7876387238502502,
303
+ "rewards/margins": 0.7681831121444702,
304
+ "rewards/rejected": -1.5558221340179443,
305
  "step": 190
306
  },
307
  {
308
+ "epoch": 0.39,
309
+ "learning_rate": 3.804052504529933e-07,
310
+ "logits/chosen": -1.1186742782592773,
311
+ "logits/rejected": -0.7032889723777771,
312
+ "logps/chosen": -351.2778625488281,
313
+ "logps/rejected": -416.71820068359375,
314
+ "loss": 0.5169,
315
+ "rewards/accuracies": 0.7475000023841858,
316
+ "rewards/chosen": -0.7259469032287598,
317
+ "rewards/margins": 0.874809741973877,
318
+ "rewards/rejected": -1.6007568836212158,
319
  "step": 200
320
  },
321
  {
322
+ "epoch": 0.39,
323
+ "eval_logits/chosen": -1.201006293296814,
324
+ "eval_logits/rejected": -0.8443379402160645,
325
+ "eval_logps/chosen": -366.2012023925781,
326
+ "eval_logps/rejected": -426.77203369140625,
327
+ "eval_loss": 0.531209409236908,
328
+ "eval_rewards/accuracies": 0.7065868377685547,
329
+ "eval_rewards/chosen": -0.8847335577011108,
330
+ "eval_rewards/margins": 0.7356376647949219,
331
+ "eval_rewards/rejected": -1.6203712224960327,
332
+ "eval_runtime": 494.1792,
333
+ "eval_samples_per_second": 4.047,
334
+ "eval_steps_per_second": 0.338,
335
  "step": 200
336
  },
337
  {
338
+ "epoch": 0.41,
339
+ "learning_rate": 3.654794035589483e-07,
340
+ "logits/chosen": -0.9955520629882812,
341
+ "logits/rejected": -0.5436328649520874,
342
+ "logps/chosen": -402.7477722167969,
343
+ "logps/rejected": -444.9473876953125,
344
+ "loss": 0.5126,
345
+ "rewards/accuracies": 0.7225000262260437,
346
+ "rewards/chosen": -1.0243951082229614,
347
+ "rewards/margins": 0.7689486742019653,
348
+ "rewards/rejected": -1.7933436632156372,
349
  "step": 210
350
  },
351
  {
352
+ "epoch": 0.43,
353
+ "learning_rate": 3.5001042761570826e-07,
354
+ "logits/chosen": -0.7878814935684204,
355
+ "logits/rejected": -0.33438754081726074,
356
+ "logps/chosen": -379.41448974609375,
357
+ "logps/rejected": -452.28009033203125,
358
+ "loss": 0.5159,
359
+ "rewards/accuracies": 0.7475000023841858,
360
+ "rewards/chosen": -1.0701900720596313,
361
+ "rewards/margins": 0.8491780161857605,
362
+ "rewards/rejected": -1.919368028640747,
363
  "step": 220
364
  },
365
  {
366
+ "epoch": 0.45,
367
+ "learning_rate": 3.34071077157304e-07,
368
+ "logits/chosen": -0.6851831078529358,
369
+ "logits/rejected": -0.29147180914878845,
370
+ "logps/chosen": -360.47869873046875,
371
+ "logps/rejected": -406.3958740234375,
372
+ "loss": 0.5399,
373
+ "rewards/accuracies": 0.7149999737739563,
374
+ "rewards/chosen": -0.9100778698921204,
375
+ "rewards/margins": 0.7056692242622375,
376
+ "rewards/rejected": -1.6157469749450684,
377
  "step": 230
378
  },
379
  {
380
+ "epoch": 0.47,
381
+ "learning_rate": 3.1773631900892204e-07,
382
+ "logits/chosen": -0.6293848752975464,
383
+ "logits/rejected": -0.2972988784313202,
384
+ "logps/chosen": -364.2557067871094,
385
+ "logps/rejected": -426.8414306640625,
386
+ "loss": 0.5184,
387
+ "rewards/accuracies": 0.75,
388
+ "rewards/chosen": -0.945137083530426,
389
+ "rewards/margins": 0.7834777235984802,
390
+ "rewards/rejected": -1.7286149263381958,
391
  "step": 240
392
  },
393
  {
394
+ "epoch": 0.49,
395
+ "learning_rate": 3.0108297969883103e-07,
396
+ "logits/chosen": -0.6830095052719116,
397
+ "logits/rejected": -0.20727473497390747,
398
+ "logps/chosen": -377.15960693359375,
399
+ "logps/rejected": -440.8514709472656,
400
+ "loss": 0.5199,
401
+ "rewards/accuracies": 0.7475000023841858,
402
+ "rewards/chosen": -0.9253360033035278,
403
+ "rewards/margins": 0.7137148380279541,
404
+ "rewards/rejected": -1.6390507221221924,
405
  "step": 250
406
  },
407
  {
408
+ "epoch": 0.51,
409
+ "learning_rate": 2.8418938412365013e-07,
410
+ "logits/chosen": -0.595008909702301,
411
+ "logits/rejected": -0.22117982804775238,
412
+ "logps/chosen": -378.3102722167969,
413
+ "logps/rejected": -421.2056884765625,
414
+ "loss": 0.5259,
415
+ "rewards/accuracies": 0.699999988079071,
416
+ "rewards/chosen": -1.0280470848083496,
417
+ "rewards/margins": 0.6548060774803162,
418
+ "rewards/rejected": -1.682853102684021,
419
  "step": 260
420
  },
421
  {
422
+ "epoch": 0.53,
423
+ "learning_rate": 2.671349871664101e-07,
424
+ "logits/chosen": -0.4738517105579376,
425
+ "logits/rejected": -0.06301561743021011,
426
+ "logps/chosen": -391.0889892578125,
427
+ "logps/rejected": -433.60174560546875,
428
+ "loss": 0.4996,
429
+ "rewards/accuracies": 0.7749999761581421,
430
+ "rewards/chosen": -0.925932765007019,
431
+ "rewards/margins": 0.8979344367980957,
432
+ "rewards/rejected": -1.8238672018051147,
433
  "step": 270
434
  },
435
  {
436
+ "epoch": 0.55,
437
+ "learning_rate": 2.5e-07,
438
+ "logits/chosen": -0.29330724477767944,
439
+ "logits/rejected": 0.11182761192321777,
440
+ "logps/chosen": -400.1533203125,
441
+ "logps/rejected": -453.4571228027344,
442
+ "loss": 0.5108,
443
+ "rewards/accuracies": 0.7174999713897705,
444
+ "rewards/chosen": -1.1598564386367798,
445
+ "rewards/margins": 0.7635893821716309,
446
+ "rewards/rejected": -1.9234455823898315,
447
  "step": 280
448
  },
449
  {
450
+ "epoch": 0.57,
451
+ "learning_rate": 2.3286501283358982e-07,
452
+ "logits/chosen": -0.049084682017564774,
453
+ "logits/rejected": 0.32071781158447266,
454
+ "logps/chosen": -421.474853515625,
455
+ "logps/rejected": -480.5507507324219,
456
+ "loss": 0.5107,
457
+ "rewards/accuracies": 0.75,
458
+ "rewards/chosen": -1.2823936939239502,
459
+ "rewards/margins": 0.920534610748291,
460
+ "rewards/rejected": -2.202928304672241,
461
  "step": 290
462
  },
463
  {
464
+ "epoch": 0.59,
465
+ "learning_rate": 2.1581061587634987e-07,
466
+ "logits/chosen": -0.3210409879684448,
467
+ "logits/rejected": 0.13426151871681213,
468
+ "logps/chosen": -392.66351318359375,
469
+ "logps/rejected": -457.4385681152344,
470
+ "loss": 0.5133,
471
+ "rewards/accuracies": 0.7825000286102295,
472
+ "rewards/chosen": -1.2225959300994873,
473
+ "rewards/margins": 0.9219253659248352,
474
+ "rewards/rejected": -2.1445212364196777,
475
  "step": 300
476
  },
477
  {
478
+ "epoch": 0.59,
479
+ "eval_logits/chosen": -0.38526856899261475,
480
+ "eval_logits/rejected": 0.0459565594792366,
481
+ "eval_logps/chosen": -396.590576171875,
482
+ "eval_logps/rejected": -460.7764892578125,
483
+ "eval_loss": 0.5159304141998291,
484
+ "eval_rewards/accuracies": 0.7245509028434753,
485
+ "eval_rewards/chosen": -1.1886271238327026,
486
+ "eval_rewards/margins": 0.7717891931533813,
487
+ "eval_rewards/rejected": -1.9604166746139526,
488
+ "eval_runtime": 494.4328,
489
+ "eval_samples_per_second": 4.045,
490
+ "eval_steps_per_second": 0.338,
491
  "step": 300
492
  },
493
  {
494
+ "epoch": 0.61,
495
+ "learning_rate": 1.9891702030116897e-07,
496
+ "logits/chosen": -0.6406633257865906,
497
+ "logits/rejected": 0.15507885813713074,
498
+ "logps/chosen": -384.56219482421875,
499
+ "logps/rejected": -443.3284912109375,
500
+ "loss": 0.5192,
501
+ "rewards/accuracies": 0.7599999904632568,
502
+ "rewards/chosen": -1.066334843635559,
503
+ "rewards/margins": 0.8297566175460815,
504
+ "rewards/rejected": -1.8960914611816406,
505
  "step": 310
506
  },
507
  {
508
+ "epoch": 0.63,
509
+ "learning_rate": 1.8226368099107792e-07,
510
+ "logits/chosen": -0.6926136016845703,
511
+ "logits/rejected": -0.09604160487651825,
512
+ "logps/chosen": -414.7826232910156,
513
+ "logps/rejected": -454.5480041503906,
514
+ "loss": 0.5065,
515
+ "rewards/accuracies": 0.7250000238418579,
516
+ "rewards/chosen": -1.0457278490066528,
517
+ "rewards/margins": 0.7350744605064392,
518
+ "rewards/rejected": -1.7808022499084473,
519
  "step": 320
520
  },
521
  {
522
+ "epoch": 0.65,
523
+ "learning_rate": 1.6592892284269594e-07,
524
+ "logits/chosen": -0.5141594409942627,
525
+ "logits/rejected": 0.11050853878259659,
526
+ "logps/chosen": -402.63348388671875,
527
+ "logps/rejected": -431.8319091796875,
528
+ "loss": 0.5093,
529
+ "rewards/accuracies": 0.737500011920929,
530
+ "rewards/chosen": -1.0640606880187988,
531
+ "rewards/margins": 0.7925867438316345,
532
+ "rewards/rejected": -1.8566473722457886,
533
  "step": 330
534
  },
535
  {
536
+ "epoch": 0.67,
537
+ "learning_rate": 1.4998957238429172e-07,
538
+ "logits/chosen": -0.08297364413738251,
539
+ "logits/rejected": 0.21859808266162872,
540
+ "logps/chosen": -390.8412170410156,
541
+ "logps/rejected": -461.3310546875,
542
+ "loss": 0.505,
543
+ "rewards/accuracies": 0.7275000214576721,
544
+ "rewards/chosen": -1.190333604812622,
545
+ "rewards/margins": 0.8922053575515747,
546
+ "rewards/rejected": -2.0825393199920654,
547
  "step": 340
548
  },
549
  {
550
+ "epoch": 0.69,
551
+ "learning_rate": 1.345205964410517e-07,
552
+ "logits/chosen": -0.539190948009491,
553
+ "logits/rejected": -0.053236301988363266,
554
+ "logps/chosen": -392.14385986328125,
555
+ "logps/rejected": -447.09844970703125,
556
+ "loss": 0.5125,
557
+ "rewards/accuracies": 0.7774999737739563,
558
+ "rewards/chosen": -0.9940242767333984,
559
+ "rewards/margins": 0.9291434288024902,
560
+ "rewards/rejected": -1.9231675863265991,
561
  "step": 350
562
  },
563
  {
564
+ "epoch": 0.71,
565
+ "learning_rate": 1.1959474954700665e-07,
566
+ "logits/chosen": -0.6150873303413391,
567
+ "logits/rejected": -0.08470536023378372,
568
+ "logps/chosen": -377.5425109863281,
569
+ "logps/rejected": -434.1069030761719,
570
+ "loss": 0.5266,
571
+ "rewards/accuracies": 0.7599999904632568,
572
+ "rewards/chosen": -1.0171641111373901,
573
+ "rewards/margins": 0.7864332795143127,
574
+ "rewards/rejected": -1.803597092628479,
575
  "step": 360
576
  },
577
  {
578
+ "epoch": 0.73,
579
+ "learning_rate": 1.0528223176192615e-07,
580
+ "logits/chosen": -0.464309424161911,
581
+ "logits/rejected": 0.11655576527118683,
582
+ "logps/chosen": -397.9951477050781,
583
+ "logps/rejected": -446.141845703125,
584
+ "loss": 0.4885,
585
+ "rewards/accuracies": 0.7350000143051147,
586
+ "rewards/chosen": -1.1220192909240723,
587
+ "rewards/margins": 0.7690063714981079,
588
+ "rewards/rejected": -1.8910256624221802,
589
  "step": 370
590
  },
591
  {
592
+ "epoch": 0.75,
593
+ "learning_rate": 9.16503585025567e-08,
594
+ "logits/chosen": -0.3131292462348938,
595
+ "logits/rejected": 0.1059599220752716,
596
+ "logps/chosen": -398.6189880371094,
597
+ "logps/rejected": -455.5489807128906,
598
+ "loss": 0.4785,
599
+ "rewards/accuracies": 0.7774999737739563,
600
+ "rewards/chosen": -1.180424451828003,
601
+ "rewards/margins": 0.9602058529853821,
602
+ "rewards/rejected": -2.1406302452087402,
603
  "step": 380
604
  },
605
  {
606
+ "epoch": 0.77,
607
+ "learning_rate": 7.876324394107017e-08,
608
+ "logits/chosen": -0.06371825933456421,
609
+ "logits/rejected": 0.4222162663936615,
610
+ "logps/chosen": -408.15203857421875,
611
+ "logps/rejected": -469.3525085449219,
612
+ "loss": 0.4945,
613
+ "rewards/accuracies": 0.7774999737739563,
614
+ "rewards/chosen": -1.2744272947311401,
615
+ "rewards/margins": 0.8693990111351013,
616
+ "rewards/rejected": -2.1438262462615967,
617
  "step": 390
618
  },
619
  {
620
+ "epoch": 0.79,
621
+ "learning_rate": 6.668149945978201e-08,
622
+ "logits/chosen": -0.4337286353111267,
623
+ "logits/rejected": 0.11450805515050888,
624
+ "logps/chosen": -406.1577453613281,
625
+ "logps/rejected": -468.1871337890625,
626
+ "loss": 0.4968,
627
+ "rewards/accuracies": 0.7574999928474426,
628
+ "rewards/chosen": -1.204884648323059,
629
+ "rewards/margins": 0.9240193367004395,
630
+ "rewards/rejected": -2.128904104232788,
631
  "step": 400
632
  },
633
  {
634
+ "epoch": 0.79,
635
+ "eval_logits/chosen": -0.2552393972873688,
636
+ "eval_logits/rejected": 0.20138485729694366,
637
+ "eval_logps/chosen": -402.1766357421875,
638
+ "eval_logps/rejected": -475.3639221191406,
639
+ "eval_loss": 0.5057728290557861,
640
+ "eval_rewards/accuracies": 0.7140718698501587,
641
+ "eval_rewards/chosen": -1.2444883584976196,
642
+ "eval_rewards/margins": 0.8618020415306091,
643
+ "eval_rewards/rejected": -2.106290578842163,
644
+ "eval_runtime": 493.9837,
645
+ "eval_samples_per_second": 4.049,
646
+ "eval_steps_per_second": 0.338,
647
  "step": 400
648
  },
649
  {
650
+ "epoch": 0.8,
651
+ "learning_rate": 5.546194858038072e-08,
652
+ "logits/chosen": -0.3444100618362427,
653
+ "logits/rejected": 0.08428356051445007,
654
+ "logps/chosen": -419.0089111328125,
655
+ "logps/rejected": -482.5577392578125,
656
+ "loss": 0.488,
657
+ "rewards/accuracies": 0.7325000166893005,
658
+ "rewards/chosen": -1.1570134162902832,
659
+ "rewards/margins": 0.9088660478591919,
660
+ "rewards/rejected": -2.0658795833587646,
661
  "step": 410
662
  },
663
  {
664
+ "epoch": 0.82,
665
+ "learning_rate": 4.5157359708432626e-08,
666
+ "logits/chosen": -0.3363034129142761,
667
+ "logits/rejected": 0.1421819031238556,
668
+ "logps/chosen": -417.26116943359375,
669
+ "logps/rejected": -475.9188537597656,
670
+ "loss": 0.5012,
671
+ "rewards/accuracies": 0.7549999952316284,
672
+ "rewards/chosen": -1.1876376867294312,
673
+ "rewards/margins": 0.9119570255279541,
674
+ "rewards/rejected": -2.0995945930480957,
675
  "step": 420
676
  },
677
  {
678
+ "epoch": 0.84,
679
+ "learning_rate": 3.581619795012874e-08,
680
+ "logits/chosen": -0.4450594186782837,
681
+ "logits/rejected": 0.03785795345902443,
682
+ "logps/chosen": -404.95281982421875,
683
+ "logps/rejected": -467.25531005859375,
684
+ "loss": 0.4861,
685
+ "rewards/accuracies": 0.7724999785423279,
686
+ "rewards/chosen": -1.1584584712982178,
687
+ "rewards/margins": 0.9622448086738586,
688
+ "rewards/rejected": -2.1207032203674316,
689
  "step": 430
690
  },
691
  {
692
+ "epoch": 0.86,
693
+ "learning_rate": 2.748239716854589e-08,
694
+ "logits/chosen": -0.31011733412742615,
695
+ "logits/rejected": 0.310569167137146,
696
+ "logps/chosen": -389.67132568359375,
697
+ "logps/rejected": -470.01104736328125,
698
+ "loss": 0.5105,
699
+ "rewards/accuracies": 0.7350000143051147,
700
+ "rewards/chosen": -1.1304560899734497,
701
+ "rewards/margins": 0.8861461877822876,
702
+ "rewards/rejected": -2.016602039337158,
703
  "step": 440
704
  },
705
  {
706
+ "epoch": 0.88,
707
+ "learning_rate": 2.0195153351498323e-08,
708
+ "logits/chosen": -0.3003827631473541,
709
+ "logits/rejected": 0.046957388520240784,
710
+ "logps/chosen": -412.5171203613281,
711
+ "logps/rejected": -481.26898193359375,
712
+ "loss": 0.5128,
713
+ "rewards/accuracies": 0.699999988079071,
714
+ "rewards/chosen": -1.1812173128128052,
715
+ "rewards/margins": 0.8305546641349792,
716
+ "rewards/rejected": -2.0117719173431396,
717
  "step": 450
718
  },
719
  {
720
+ "epoch": 0.9,
721
+ "learning_rate": 1.3988740262822846e-08,
722
+ "logits/chosen": -0.47582343220710754,
723
+ "logits/rejected": -0.11152289062738419,
724
+ "logps/chosen": -410.2917175292969,
725
+ "logps/rejected": -457.774658203125,
726
+ "loss": 0.5044,
727
+ "rewards/accuracies": 0.7649999856948853,
728
+ "rewards/chosen": -1.1460288763046265,
729
+ "rewards/margins": 0.8546761870384216,
730
+ "rewards/rejected": -2.0007050037384033,
731
  "step": 460
732
  },
733
  {
734
+ "epoch": 0.92,
735
+ "learning_rate": 8.892348244137788e-09,
736
+ "logits/chosen": -0.5770422220230103,
737
+ "logits/rejected": -0.025662722066044807,
738
+ "logps/chosen": -372.98187255859375,
739
+ "logps/rejected": -467.86199951171875,
740
+ "loss": 0.4973,
741
+ "rewards/accuracies": 0.7200000286102295,
742
+ "rewards/chosen": -1.0886142253875732,
743
+ "rewards/margins": 0.8808639049530029,
744
+ "rewards/rejected": -1.9694780111312866,
745
  "step": 470
746
  },
747
  {
748
+ "epoch": 0.94,
749
+ "learning_rate": 4.929946925231076e-09,
750
+ "logits/chosen": -0.5876446962356567,
751
+ "logits/rejected": -0.16365936398506165,
752
+ "logps/chosen": -400.3377685546875,
753
+ "logps/rejected": -455.9208068847656,
754
+ "loss": 0.5072,
755
+ "rewards/accuracies": 0.7024999856948853,
756
+ "rewards/chosen": -1.1451067924499512,
757
+ "rewards/margins": 0.7030719518661499,
758
+ "rewards/rejected": -1.848178744316101,
759
  "step": 480
760
  },
761
  {
762
+ "epoch": 0.96,
763
+ "learning_rate": 2.1201724887858484e-09,
764
+ "logits/chosen": -0.4430970847606659,
765
+ "logits/rejected": 0.12594802677631378,
766
+ "logps/chosen": -409.6846008300781,
767
+ "logps/rejected": -458.5526428222656,
768
+ "loss": 0.4887,
769
+ "rewards/accuracies": 0.7574999928474426,
770
+ "rewards/chosen": -1.0775573253631592,
771
+ "rewards/margins": 0.9305427074432373,
772
+ "rewards/rejected": -2.0081000328063965,
773
  "step": 490
774
  },
775
  {
776
+ "epoch": 0.98,
777
+ "learning_rate": 4.762400196664518e-10,
778
+ "logits/chosen": -0.41937455534935,
779
+ "logits/rejected": -0.08660510927438736,
780
+ "logps/chosen": -385.8563232421875,
781
+ "logps/rejected": -454.9473571777344,
782
+ "loss": 0.4833,
783
+ "rewards/accuracies": 0.7699999809265137,
784
+ "rewards/chosen": -1.093034267425537,
785
+ "rewards/margins": 0.9196186661720276,
786
+ "rewards/rejected": -2.012652635574341,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
787
  "step": 500
788
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
789
  {
790
  "epoch": 0.98,
791
+ "eval_logits/chosen": -0.4496035575866699,
792
+ "eval_logits/rejected": 0.04359949380159378,
793
+ "eval_logps/chosen": -395.9374084472656,
794
+ "eval_logps/rejected": -470.5448303222656,
795
+ "eval_loss": 0.5045374631881714,
796
+ "eval_rewards/accuracies": 0.726047933101654,
797
+ "eval_rewards/chosen": -1.182096004486084,
798
+ "eval_rewards/margins": 0.876003086566925,
799
+ "eval_rewards/rejected": -2.0580990314483643,
800
+ "eval_runtime": 494.2334,
801
+ "eval_samples_per_second": 4.047,
802
+ "eval_steps_per_second": 0.338,
803
+ "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
804
  },
805
  {
806
  "epoch": 1.0,
807
+ "step": 509,
808
  "total_flos": 0.0,
809
+ "train_loss": 0.5401819272219315,
810
+ "train_runtime": 34352.758,
811
+ "train_samples_per_second": 1.78,
812
+ "train_steps_per_second": 0.015
813
  }
814
  ],
815
  "logging_steps": 10,
816
+ "max_steps": 509,
817
  "num_input_tokens_seen": 0,
818
  "num_train_epochs": 1,
819
+ "save_steps": 1000,
820
  "total_flos": 0.0,
821
+ "train_batch_size": 4,
822
  "trial_name": null,
823
  "trial_params": null
824
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5bee5bf7183364b2486d76255411c15d45960d01e4698754b0948377ea6dc02e
3
- size 6072
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4326671143f1f88098cb0c8ef537001a10fe9c8346367f54f87f1f4f56ca7627
3
+ size 5944