lole25 committed
Commit: b80f3e8
Parent: 8314883

Model save
README.md CHANGED
@@ -2,13 +2,9 @@
  license: mit
  library_name: peft
  tags:
- - alignment-handbook
- - generated_from_trainer
  - trl
  - dpo
  - generated_from_trainer
- datasets:
- - HuggingFaceH4/ultrafeedback_binarized
  base_model: microsoft/phi-2
  model-index:
  - name: phi-2-dpo-ultrachat-lora
@@ -20,17 +16,17 @@ should probably proofread and complete it, then remove this comment. -->

  # phi-2-dpo-ultrachat-lora

- This model is a fine-tuned version of [lole25/phi-2-sft-ultrachat-lora](https://huggingface.co/lole25/phi-2-sft-ultrachat-lora) on the HuggingFaceH4/ultrafeedback_binarized dataset.
+ This model is a fine-tuned version of [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) on the None dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.6932
- - Rewards/chosen: -0.0003
- - Rewards/rejected: -0.0002
- - Rewards/accuracies: 0.2480
- - Rewards/margins: -0.0000
- - Logps/rejected: -94.2209
- - Logps/chosen: -91.7559
- - Logits/rejected: 0.8201
- - Logits/chosen: 0.8023
+ - Loss: 0.6912
+ - Rewards/chosen: -0.0072
+ - Rewards/rejected: -0.0111
+ - Rewards/accuracies: 0.3180
+ - Rewards/margins: 0.0040
+ - Logps/rejected: -95.3090
+ - Logps/chosen: -92.4438
+ - Logits/rejected: 0.8021
+ - Logits/chosen: 0.7828

  ## Model description

@@ -65,6 +61,12 @@ The following hyperparameters were used during training:

  ### Training results

+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
+ |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
+ | 0.693 | 0.21 | 100 | 0.6931 | -0.0005 | -0.0008 | 0.2680 | 0.0004 | -94.2804 | -91.7748 | 0.8176 | 0.7998 |
+ | 0.6922 | 0.42 | 200 | 0.6924 | -0.0018 | -0.0032 | 0.3020 | 0.0014 | -94.5141 | -91.9068 | 0.8121 | 0.7941 |
+ | 0.6917 | 0.63 | 300 | 0.6917 | -0.0049 | -0.0077 | 0.3100 | 0.0028 | -94.9659 | -92.2189 | 0.8057 | 0.7870 |
+ | 0.6905 | 0.84 | 400 | 0.6913 | -0.0070 | -0.0105 | 0.3280 | 0.0036 | -95.2509 | -92.4247 | 0.8012 | 0.7827 |


  ### Framework versions
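
The card above describes a LoRA adapter (PEFT) trained with TRL's DPO on top of microsoft/phi-2. As a hedged sketch, the adapter can be loaded for inference roughly as follows; the repo id is assumed from the model name above, and older transformers releases may additionally need trust_remote_code=True for phi-2:

```python
# Minimal sketch: attach the LoRA adapter to the base model with PEFT.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
model = PeftModel.from_pretrained(base, "lole25/phi-2-dpo-ultrachat-lora")  # assumed repo id

prompt = "Explain direct preference optimization in one sentence."
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```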
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f8dbddb9bc73f2f4265bd6b0698a74222fd439f2c917687f8a7722ed5576bc54
+ oid sha256:305ec4283375fd6264f9e6b7561a8941ac32582c9088af742777fac3c094eeba
  size 41977616
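
The changed adapter_model.safetensors entry is only a Git LFS pointer; the ~42 MB weight file itself lives in LFS storage. A sketch for pulling exactly this revision with huggingface_hub (repo id assumed as above; the revision is the commit hash from the header):

```python
# Sketch: fetch the LFS-backed adapter weights at this specific commit.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="lole25/phi-2-dpo-ultrachat-lora",   # assumed repo id
    filename="adapter_model.safetensors",
    revision="b80f3e8",                          # commit from the header above
)
print(path)
```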
all_results.json CHANGED
@@ -1,21 +1,21 @@
  {
- "epoch": 0.99,
- "eval_logits/chosen": 0.8022828698158264,
- "eval_logits/rejected": 0.8200842142105103,
- "eval_logps/chosen": -91.75594329833984,
- "eval_logps/rejected": -94.22090911865234,
- "eval_loss": 0.6932105422019958,
- "eval_rewards/accuracies": 0.24799999594688416,
- "eval_rewards/chosen": -0.0002857264771591872,
- "eval_rewards/margins": -4.674374213209376e-05,
- "eval_rewards/rejected": -0.00023898274230305105,
- "eval_runtime": 273.9391,
+ "epoch": 1.0,
+ "eval_logits/chosen": 0.7827913761138916,
+ "eval_logits/rejected": 0.8020623326301575,
+ "eval_logps/chosen": -92.44380187988281,
+ "eval_logps/rejected": -95.30902099609375,
+ "eval_loss": 0.6912217736244202,
+ "eval_rewards/accuracies": 0.3179999887943268,
+ "eval_rewards/chosen": -0.00716440798714757,
+ "eval_rewards/margins": 0.00395576748996973,
+ "eval_rewards/rejected": -0.011120175942778587,
+ "eval_runtime": 273.2522,
  "eval_samples": 2000,
- "eval_samples_per_second": 7.301,
- "eval_steps_per_second": 0.456,
- "train_loss": 0.6931231824975265,
- "train_runtime": 1147.8486,
- "train_samples": 6113,
- "train_samples_per_second": 5.326,
- "train_steps_per_second": 0.083
+ "eval_samples_per_second": 7.319,
+ "eval_steps_per_second": 0.457,
+ "train_loss": 0.6920521804121805,
+ "train_runtime": 6813.2628,
+ "train_samples": 30567,
+ "train_samples_per_second": 4.486,
+ "train_steps_per_second": 0.07
  }
eval_results.json CHANGED
@@ -1,16 +1,16 @@
  {
- "epoch": 0.99,
- "eval_logits/chosen": 0.8022828698158264,
- "eval_logits/rejected": 0.8200842142105103,
- "eval_logps/chosen": -91.75594329833984,
- "eval_logps/rejected": -94.22090911865234,
- "eval_loss": 0.6932105422019958,
- "eval_rewards/accuracies": 0.24799999594688416,
- "eval_rewards/chosen": -0.0002857264771591872,
- "eval_rewards/margins": -4.674374213209376e-05,
- "eval_rewards/rejected": -0.00023898274230305105,
- "eval_runtime": 273.9391,
+ "epoch": 1.0,
+ "eval_logits/chosen": 0.7827913761138916,
+ "eval_logits/rejected": 0.8020623326301575,
+ "eval_logps/chosen": -92.44380187988281,
+ "eval_logps/rejected": -95.30902099609375,
+ "eval_loss": 0.6912217736244202,
+ "eval_rewards/accuracies": 0.3179999887943268,
+ "eval_rewards/chosen": -0.00716440798714757,
+ "eval_rewards/margins": 0.00395576748996973,
+ "eval_rewards/rejected": -0.011120175942778587,
+ "eval_runtime": 273.2522,
  "eval_samples": 2000,
- "eval_samples_per_second": 7.301,
- "eval_steps_per_second": 0.456
+ "eval_samples_per_second": 7.319,
+ "eval_steps_per_second": 0.457
  }
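
The eval_rewards/* and eval_logps/* keys follow TRL's DPO convention: each completion's implicit reward is the beta-scaled gap between the policy's and the frozen reference model's summed log-probabilities, and rewards/accuracies is the fraction of pairs where the chosen reward beats the rejected one. A minimal sketch of those relations (beta is the DPOTrainer `beta` argument, which is not shown in this diff; 0.1 below is only a placeholder):

```python
import torch
import torch.nn.functional as F

def dpo_eval_metrics(policy_chosen_logps, policy_rejected_logps,
                     ref_chosen_logps, ref_rejected_logps, beta=0.1):
    """Relate per-sequence log-probs to the rewards/* metrics logged above."""
    rewards_chosen = beta * (policy_chosen_logps - ref_chosen_logps)
    rewards_rejected = beta * (policy_rejected_logps - ref_rejected_logps)
    margins = rewards_chosen - rewards_rejected
    accuracies = (margins > 0).float().mean()   # rewards/accuracies
    loss = -F.logsigmoid(margins).mean()        # sigmoid DPO loss
    return {
        "loss": loss.item(),
        "rewards/chosen": rewards_chosen.mean().item(),
        "rewards/rejected": rewards_rejected.mean().item(),
        "rewards/margins": margins.mean().item(),
        "rewards/accuracies": accuracies.item(),
    }
```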
runs/Mar01_07-41-58_gpu4-119-4/events.out.tfevents.1709239476.gpu4-119-4.1724210.0 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ec52fa9532e89188839a74a31bed6a90207508d2806f4dd5c7a8d6e7cded9daa
- size 33628
+ oid sha256:e0914a4c2c529be24a3baad7b988b701bedae6634d92d5f0f86147dd98279695
+ size 38420
runs/Mar01_07-41-58_gpu4-119-4/events.out.tfevents.1709246562.gpu4-119-4.1724210.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2ce15b9319daaf97e9395bf9eabce614a4c92f3b9d1d8ac2b369d584227fa6f
+ size 828
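
The runs/ entries are TensorBoard event files (also stored as LFS pointers); the small new .1 file was written after training finished and most likely holds the end-of-run summary. A sketch for inspecting the logged scalars from a local copy of the run directory (tag names vary, so list them first):

```python
# Sketch: read scalars from the TensorBoard run directory (assumes a local copy).
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator("runs/Mar01_07-41-58_gpu4-119-4")
acc.Reload()
print(acc.Tags()["scalars"])        # discover the actual tag names first
for tag in acc.Tags()["scalars"]:
    last = acc.Scalars(tag)[-1]     # most recent logged value for this tag
    print(tag, last.step, last.value)
```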
train_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 0.99,
- "train_loss": 0.6931231824975265,
- "train_runtime": 1147.8486,
- "train_samples": 6113,
- "train_samples_per_second": 5.326,
- "train_steps_per_second": 0.083
+ "epoch": 1.0,
+ "train_loss": 0.6920521804121805,
+ "train_runtime": 6813.2628,
+ "train_samples": 30567,
+ "train_samples_per_second": 4.486,
+ "train_steps_per_second": 0.07
  }
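
The throughput fields in train_results.json are derived quantities; a quick arithmetic check against the new values (477 optimizer steps per the trainer_state.json diff below, 30567 samples, 6813.26 s runtime) reproduces the reported rates and implies an effective batch of roughly 64 samples per optimizer step:

```python
# Quick consistency check of the reported throughput numbers.
train_samples, train_runtime_s, max_steps = 30567, 6813.2628, 477

print(round(train_samples / train_runtime_s, 3))  # 4.486 samples/s, as reported
print(round(max_steps / train_runtime_s, 2))      # 0.07 steps/s, as reported
print(round(train_samples / max_steps, 1))        # ~64.1 samples per optimizer step
```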
trainer_state.json CHANGED
@@ -1,20 +1,20 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9921671018276762,
5
  "eval_steps": 100,
6
- "global_step": 95,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.01,
13
- "learning_rate": 5.000000000000001e-07,
14
- "logits/chosen": 0.8826487064361572,
15
- "logits/rejected": 0.921362042427063,
16
- "logps/chosen": -36.58121871948242,
17
- "logps/rejected": -54.902320861816406,
18
  "loss": 0.6931,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
@@ -23,143 +23,739 @@
23
  "step": 1
24
  },
25
  {
26
- "epoch": 0.1,
27
- "learning_rate": 5e-06,
28
- "logits/chosen": 0.8917223811149597,
29
- "logits/rejected": 0.875190019607544,
30
- "logps/chosen": -87.82333374023438,
31
- "logps/rejected": -96.38070678710938,
32
- "loss": 0.6934,
33
  "rewards/accuracies": 0.2361111044883728,
34
- "rewards/chosen": -0.00021312937315087765,
35
- "rewards/margins": -0.00011523600551299751,
36
- "rewards/rejected": -9.78933458100073e-05,
37
  "step": 10
38
  },
39
  {
40
- "epoch": 0.21,
41
- "learning_rate": 4.83118057351089e-06,
42
- "logits/chosen": 0.780292272567749,
43
- "logits/rejected": 0.8473358154296875,
44
- "logps/chosen": -91.71416473388672,
45
- "logps/rejected": -85.1246566772461,
46
  "loss": 0.6932,
47
- "rewards/accuracies": 0.25,
48
- "rewards/chosen": 0.00043298042146489024,
49
- "rewards/margins": 0.00031665078131482005,
50
- "rewards/rejected": 0.00011632966197794303,
51
  "step": 20
52
  },
53
  {
54
- "epoch": 0.31,
55
- "learning_rate": 4.3475222930516484e-06,
56
- "logits/chosen": 0.8648103475570679,
57
- "logits/rejected": 0.8488438725471497,
58
- "logps/chosen": -85.26679992675781,
59
- "logps/rejected": -78.0839614868164,
60
- "loss": 0.6929,
61
- "rewards/accuracies": 0.3125,
62
- "rewards/chosen": 0.0005245368229225278,
63
- "rewards/margins": 0.00098425371106714,
64
- "rewards/rejected": -0.0004597169754561037,
65
  "step": 30
66
  },
67
  {
68
- "epoch": 0.42,
69
- "learning_rate": 3.6143458894413463e-06,
70
- "logits/chosen": 0.7656652927398682,
71
- "logits/rejected": 0.8423829078674316,
72
- "logps/chosen": -122.68544006347656,
73
- "logps/rejected": -108.7737045288086,
74
  "loss": 0.6931,
75
- "rewards/accuracies": 0.28125,
76
- "rewards/chosen": 0.00047112005995586514,
77
- "rewards/margins": 0.00047660223208367825,
78
- "rewards/rejected": -5.482271262735594e-06,
79
  "step": 40
80
  },
81
  {
82
- "epoch": 0.52,
83
- "learning_rate": 2.730670898658255e-06,
84
- "logits/chosen": 0.8398499488830566,
85
- "logits/rejected": 0.8686380386352539,
86
- "logps/chosen": -65.76744079589844,
87
- "logps/rejected": -71.65315246582031,
88
  "loss": 0.6932,
89
- "rewards/accuracies": 0.23749999701976776,
90
- "rewards/chosen": -0.00012343151320237666,
91
- "rewards/margins": -0.0002229490492027253,
92
- "rewards/rejected": 9.951753600034863e-05,
93
  "step": 50
94
  },
95
  {
96
- "epoch": 0.63,
97
- "learning_rate": 1.8158425248197931e-06,
98
- "logits/chosen": 0.814559817314148,
99
- "logits/rejected": 0.8853690028190613,
100
- "logps/chosen": -117.60276794433594,
101
- "logps/rejected": -110.95735168457031,
102
  "loss": 0.6931,
103
- "rewards/accuracies": 0.32499998807907104,
104
- "rewards/chosen": 0.0003024066681973636,
105
- "rewards/margins": 0.000562971574254334,
106
- "rewards/rejected": -0.0002605649351608008,
107
  "step": 60
108
  },
109
  {
110
- "epoch": 0.73,
111
- "learning_rate": 9.934134090518593e-07,
112
- "logits/chosen": 0.8051468133926392,
113
- "logits/rejected": 0.8624836802482605,
114
- "logps/chosen": -82.67887115478516,
115
- "logps/rejected": -83.9195327758789,
116
- "loss": 0.6932,
117
- "rewards/accuracies": 0.26249998807907104,
118
- "rewards/chosen": -0.00015662939404137433,
119
- "rewards/margins": -4.1384984797332436e-05,
120
- "rewards/rejected": -0.00011524439469212666,
121
  "step": 70
122
  },
123
  {
124
- "epoch": 0.84,
125
- "learning_rate": 3.7445716067596506e-07,
126
- "logits/chosen": 0.7656813263893127,
127
- "logits/rejected": 0.8298920392990112,
128
- "logps/chosen": -105.78785705566406,
129
- "logps/rejected": -104.972900390625,
130
  "loss": 0.6931,
131
  "rewards/accuracies": 0.2750000059604645,
132
- "rewards/chosen": 0.0002620227460283786,
133
- "rewards/margins": 0.00032469426514580846,
134
- "rewards/rejected": -6.267154822126031e-05,
135
- "step": 80
136
  },
137
  {
138
  "epoch": 0.94,
139
- "learning_rate": 4.256725079024554e-08,
140
- "logits/chosen": 0.8350532650947571,
141
- "logits/rejected": 0.8562027215957642,
142
- "logps/chosen": -77.66747283935547,
143
- "logps/rejected": -90.00565338134766,
144
- "loss": 0.6929,
145
- "rewards/accuracies": 0.30000001192092896,
146
- "rewards/chosen": 0.00024019060947466642,
147
- "rewards/margins": 0.0006700255908071995,
148
- "rewards/rejected": -0.0004298349958844483,
149
- "step": 90
150
  },
151
  {
152
- "epoch": 0.99,
153
- "step": 95,
154
  "total_flos": 0.0,
155
- "train_loss": 0.6931231824975265,
156
- "train_runtime": 1147.8486,
157
- "train_samples_per_second": 5.326,
158
- "train_steps_per_second": 0.083
159
  }
160
  ],
161
  "logging_steps": 10,
162
- "max_steps": 95,
163
  "num_input_tokens_seen": 0,
164
  "num_train_epochs": 1,
165
  "save_steps": 100,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9984301412872841,
5
  "eval_steps": 100,
6
+ "global_step": 477,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0,
13
+ "learning_rate": 1.0416666666666667e-07,
14
+ "logits/chosen": 0.8531318306922913,
15
+ "logits/rejected": 0.7327959537506104,
16
+ "logps/chosen": -133.83494567871094,
17
+ "logps/rejected": -129.6299285888672,
18
  "loss": 0.6931,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
 
23
  "step": 1
24
  },
25
  {
26
+ "epoch": 0.02,
27
+ "learning_rate": 1.0416666666666667e-06,
28
+ "logits/chosen": 0.8151362538337708,
29
+ "logits/rejected": 0.7788704037666321,
30
+ "logps/chosen": -90.5947036743164,
31
+ "logps/rejected": -92.77328491210938,
32
+ "loss": 0.6931,
33
  "rewards/accuracies": 0.2361111044883728,
34
+ "rewards/chosen": 0.00018473717500455678,
35
+ "rewards/margins": 6.259369547478855e-05,
36
+ "rewards/rejected": 0.00012214347952976823,
37
  "step": 10
38
  },
39
  {
40
+ "epoch": 0.04,
41
+ "learning_rate": 2.0833333333333334e-06,
42
+ "logits/chosen": 0.8342723846435547,
43
+ "logits/rejected": 0.8797974586486816,
44
+ "logps/chosen": -81.16204071044922,
45
+ "logps/rejected": -81.1548080444336,
46
  "loss": 0.6932,
47
+ "rewards/accuracies": 0.17499999701976776,
48
+ "rewards/chosen": -0.00041853776201605797,
49
+ "rewards/margins": -0.0003428882628213614,
50
+ "rewards/rejected": -7.564939733128995e-05,
51
  "step": 20
52
  },
53
  {
54
+ "epoch": 0.06,
55
+ "learning_rate": 3.125e-06,
56
+ "logits/chosen": 0.8050435185432434,
57
+ "logits/rejected": 0.8031445741653442,
58
+ "logps/chosen": -101.43102264404297,
59
+ "logps/rejected": -98.3962631225586,
60
+ "loss": 0.6932,
61
+ "rewards/accuracies": 0.23749999701976776,
62
+ "rewards/chosen": 0.00019087354303337634,
63
+ "rewards/margins": -0.00028236288926564157,
64
+ "rewards/rejected": 0.000473236374091357,
65
  "step": 30
66
  },
67
  {
68
+ "epoch": 0.08,
69
+ "learning_rate": 4.166666666666667e-06,
70
+ "logits/chosen": 0.8352361917495728,
71
+ "logits/rejected": 0.8269574046134949,
72
+ "logps/chosen": -111.55462646484375,
73
+ "logps/rejected": -99.02640533447266,
74
  "loss": 0.6931,
75
+ "rewards/accuracies": 0.23749999701976776,
76
+ "rewards/chosen": -5.0148169975727797e-05,
77
+ "rewards/margins": 7.251273200381547e-05,
78
+ "rewards/rejected": -0.00012266085832379758,
79
  "step": 40
80
  },
81
  {
82
+ "epoch": 0.1,
83
+ "learning_rate": 4.999731868769027e-06,
84
+ "logits/chosen": 0.8326011896133423,
85
+ "logits/rejected": 0.8801982998847961,
86
+ "logps/chosen": -98.504638671875,
87
+ "logps/rejected": -83.65934753417969,
88
  "loss": 0.6932,
89
+ "rewards/accuracies": 0.19374999403953552,
90
+ "rewards/chosen": -0.0006545094074681401,
91
+ "rewards/margins": -0.0007917654584161937,
92
+ "rewards/rejected": 0.00013725618191529065,
93
  "step": 50
94
  },
95
  {
96
+ "epoch": 0.13,
97
+ "learning_rate": 4.9903533134293035e-06,
98
+ "logits/chosen": 0.8338711857795715,
99
+ "logits/rejected": 0.8361239433288574,
100
+ "logps/chosen": -105.47569274902344,
101
+ "logps/rejected": -103.5544662475586,
102
  "loss": 0.6931,
103
+ "rewards/accuracies": 0.26249998807907104,
104
+ "rewards/chosen": -6.698282231809571e-05,
105
+ "rewards/margins": -0.0007280272548086941,
106
+ "rewards/rejected": 0.0006610444979742169,
107
  "step": 60
108
  },
109
  {
110
+ "epoch": 0.15,
111
+ "learning_rate": 4.967625656594782e-06,
112
+ "logits/chosen": 0.8040585517883301,
113
+ "logits/rejected": 0.8523794412612915,
114
+ "logps/chosen": -83.11436462402344,
115
+ "logps/rejected": -77.95774841308594,
116
+ "loss": 0.6931,
117
+ "rewards/accuracies": 0.26875001192092896,
118
+ "rewards/chosen": -0.0002090830384986475,
119
+ "rewards/margins": -0.0001823374768719077,
120
+ "rewards/rejected": -2.6745503419078887e-05,
121
  "step": 70
122
  },
123
  {
124
+ "epoch": 0.17,
125
+ "learning_rate": 4.93167072587771e-06,
126
+ "logits/chosen": 0.8447190523147583,
127
+ "logits/rejected": 0.9011589288711548,
128
+ "logps/chosen": -101.15557098388672,
129
+ "logps/rejected": -84.94987487792969,
130
+ "loss": 0.693,
131
+ "rewards/accuracies": 0.3375000059604645,
132
+ "rewards/chosen": 0.00010431509872432798,
133
+ "rewards/margins": 0.000598998973146081,
134
+ "rewards/rejected": -0.0004946838016621768,
135
+ "step": 80
136
+ },
137
+ {
138
+ "epoch": 0.19,
139
+ "learning_rate": 4.882681251368549e-06,
140
+ "logits/chosen": 0.8417055010795593,
141
+ "logits/rejected": 0.8667267560958862,
142
+ "logps/chosen": -88.30390930175781,
143
+ "logps/rejected": -110.51224517822266,
144
  "loss": 0.6931,
145
+ "rewards/accuracies": 0.24375000596046448,
146
+ "rewards/chosen": -0.0014625315088778734,
147
+ "rewards/margins": -0.0006530345417559147,
148
+ "rewards/rejected": -0.0008094970253296196,
149
+ "step": 90
150
+ },
151
+ {
152
+ "epoch": 0.21,
153
+ "learning_rate": 4.8209198325401815e-06,
154
+ "logits/chosen": 0.8801371455192566,
155
+ "logits/rejected": 0.9127315282821655,
156
+ "logps/chosen": -86.3653793334961,
157
+ "logps/rejected": -69.10803985595703,
158
+ "loss": 0.693,
159
+ "rewards/accuracies": 0.22499999403953552,
160
+ "rewards/chosen": -0.0005041404510848224,
161
+ "rewards/margins": 0.0003193179436493665,
162
+ "rewards/rejected": -0.0008234584820456803,
163
+ "step": 100
164
+ },
165
+ {
166
+ "epoch": 0.21,
167
+ "eval_logits/chosen": 0.7997792959213257,
168
+ "eval_logits/rejected": 0.8176011443138123,
169
+ "eval_logps/chosen": -91.77481842041016,
170
+ "eval_logps/rejected": -94.28044891357422,
171
+ "eval_loss": 0.6930604577064514,
172
+ "eval_rewards/accuracies": 0.2680000066757202,
173
+ "eval_rewards/chosen": -0.00047452302533201873,
174
+ "eval_rewards/margins": 0.00035985803697258234,
175
+ "eval_rewards/rejected": -0.0008343810332007706,
176
+ "eval_runtime": 273.126,
177
+ "eval_samples_per_second": 7.323,
178
+ "eval_steps_per_second": 0.458,
179
+ "step": 100
180
+ },
181
+ {
182
+ "epoch": 0.23,
183
+ "learning_rate": 4.746717530629565e-06,
184
+ "logits/chosen": 0.8060985803604126,
185
+ "logits/rejected": 0.8291690945625305,
186
+ "logps/chosen": -110.20963287353516,
187
+ "logps/rejected": -96.49766540527344,
188
+ "loss": 0.6929,
189
+ "rewards/accuracies": 0.26249998807907104,
190
+ "rewards/chosen": -0.0005915694055147469,
191
+ "rewards/margins": 0.00022233165509533137,
192
+ "rewards/rejected": -0.0008139010751619935,
193
+ "step": 110
194
+ },
195
+ {
196
+ "epoch": 0.25,
197
+ "learning_rate": 4.660472094042121e-06,
198
+ "logits/chosen": 0.8455309867858887,
199
+ "logits/rejected": 0.9219174385070801,
200
+ "logps/chosen": -84.42987060546875,
201
+ "logps/rejected": -80.36955261230469,
202
+ "loss": 0.6931,
203
+ "rewards/accuracies": 0.28125,
204
+ "rewards/chosen": -0.0005250257672742009,
205
+ "rewards/margins": 0.00011121686839032918,
206
+ "rewards/rejected": -0.0006362426793202758,
207
+ "step": 120
208
+ },
209
+ {
210
+ "epoch": 0.27,
211
+ "learning_rate": 4.5626458262912745e-06,
212
+ "logits/chosen": 0.7771774530410767,
213
+ "logits/rejected": 0.8348148465156555,
214
+ "logps/chosen": -91.55365753173828,
215
+ "logps/rejected": -80.16468811035156,
216
+ "loss": 0.693,
217
+ "rewards/accuracies": 0.26249998807907104,
218
+ "rewards/chosen": -0.0003579836920835078,
219
+ "rewards/margins": 0.0003911118838004768,
220
+ "rewards/rejected": -0.0007490954594686627,
221
+ "step": 130
222
+ },
223
+ {
224
+ "epoch": 0.29,
225
+ "learning_rate": 4.453763107901676e-06,
226
+ "logits/chosen": 0.8080952763557434,
227
+ "logits/rejected": 0.8902841806411743,
228
+ "logps/chosen": -102.3683853149414,
229
+ "logps/rejected": -121.3236312866211,
230
+ "loss": 0.6929,
231
+ "rewards/accuracies": 0.34375,
232
+ "rewards/chosen": -0.0011437158100306988,
233
+ "rewards/margins": 0.00023913508630357683,
234
+ "rewards/rejected": -0.001382850925438106,
235
+ "step": 140
236
+ },
237
+ {
238
+ "epoch": 0.31,
239
+ "learning_rate": 4.33440758555951e-06,
240
+ "logits/chosen": 0.8286693692207336,
241
+ "logits/rejected": 0.8885159492492676,
242
+ "logps/chosen": -98.10747528076172,
243
+ "logps/rejected": -75.61839294433594,
244
+ "loss": 0.6929,
245
+ "rewards/accuracies": 0.26875001192092896,
246
+ "rewards/chosen": -0.001014741021208465,
247
+ "rewards/margins": 0.00025303030270151794,
248
+ "rewards/rejected": -0.0012677714694291353,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.33,
253
+ "learning_rate": 4.205219043576955e-06,
254
+ "logits/chosen": 0.7178612947463989,
255
+ "logits/rejected": 0.7653802633285522,
256
+ "logps/chosen": -80.03684997558594,
257
+ "logps/rejected": -73.78407287597656,
258
+ "loss": 0.6928,
259
+ "rewards/accuracies": 0.2562499940395355,
260
+ "rewards/chosen": -0.001032956875860691,
261
+ "rewards/margins": 0.0007252781069837511,
262
+ "rewards/rejected": -0.0017582349246367812,
263
+ "step": 160
264
+ },
265
+ {
266
+ "epoch": 0.36,
267
+ "learning_rate": 4.066889974440757e-06,
268
+ "logits/chosen": 0.8685981631278992,
269
+ "logits/rejected": 0.9337660670280457,
270
+ "logps/chosen": -72.55715942382812,
271
+ "logps/rejected": -81.37494659423828,
272
+ "loss": 0.6929,
273
+ "rewards/accuracies": 0.26875001192092896,
274
+ "rewards/chosen": -0.0013817875878885388,
275
+ "rewards/margins": 0.0006504356861114502,
276
+ "rewards/rejected": -0.002032223390415311,
277
+ "step": 170
278
+ },
279
+ {
280
+ "epoch": 0.38,
281
+ "learning_rate": 3.92016186682789e-06,
282
+ "logits/chosen": 0.7872442007064819,
283
+ "logits/rejected": 0.8030352592468262,
284
+ "logps/chosen": -98.71012878417969,
285
+ "logps/rejected": -94.06107330322266,
286
+ "loss": 0.6928,
287
+ "rewards/accuracies": 0.26249998807907104,
288
+ "rewards/chosen": -0.0005101398564875126,
289
+ "rewards/margins": 0.0011010088492184877,
290
+ "rewards/rejected": -0.0016111487057060003,
291
+ "step": 180
292
+ },
293
+ {
294
+ "epoch": 0.4,
295
+ "learning_rate": 3.7658212309857576e-06,
296
+ "logits/chosen": 0.7398073077201843,
297
+ "logits/rejected": 0.8337036967277527,
298
+ "logps/chosen": -97.44853973388672,
299
+ "logps/rejected": -107.1574478149414,
300
+ "loss": 0.6926,
301
+ "rewards/accuracies": 0.3187499940395355,
302
+ "rewards/chosen": -0.0017274795100092888,
303
+ "rewards/margins": 0.0012662711087614298,
304
+ "rewards/rejected": -0.0029937506187707186,
305
+ "step": 190
306
+ },
307
+ {
308
+ "epoch": 0.42,
309
+ "learning_rate": 3.604695382782159e-06,
310
+ "logits/chosen": 0.7769767642021179,
311
+ "logits/rejected": 0.8381963968276978,
312
+ "logps/chosen": -77.53779602050781,
313
+ "logps/rejected": -86.9189224243164,
314
+ "loss": 0.6922,
315
+ "rewards/accuracies": 0.3125,
316
+ "rewards/chosen": -0.0016526943072676659,
317
+ "rewards/margins": 0.0023483093827962875,
318
+ "rewards/rejected": -0.004001003224402666,
319
+ "step": 200
320
+ },
321
+ {
322
+ "epoch": 0.42,
323
+ "eval_logits/chosen": 0.7941381931304932,
324
+ "eval_logits/rejected": 0.8121381998062134,
325
+ "eval_logps/chosen": -91.90680694580078,
326
+ "eval_logps/rejected": -94.51410675048828,
327
+ "eval_loss": 0.692441999912262,
328
+ "eval_rewards/accuracies": 0.3019999861717224,
329
+ "eval_rewards/chosen": -0.0017943703569471836,
330
+ "eval_rewards/margins": 0.001376640284433961,
331
+ "eval_rewards/rejected": -0.0031710106413811445,
332
+ "eval_runtime": 273.1548,
333
+ "eval_samples_per_second": 7.322,
334
+ "eval_steps_per_second": 0.458,
335
+ "step": 200
336
+ },
337
+ {
338
+ "epoch": 0.44,
339
+ "learning_rate": 3.437648009023905e-06,
340
+ "logits/chosen": 0.837793231010437,
341
+ "logits/rejected": 0.834112286567688,
342
+ "logps/chosen": -93.53672790527344,
343
+ "logps/rejected": -86.74687194824219,
344
+ "loss": 0.6926,
345
+ "rewards/accuracies": 0.2874999940395355,
346
+ "rewards/chosen": -0.0022572961170226336,
347
+ "rewards/margins": 0.0016887232195585966,
348
+ "rewards/rejected": -0.00394601933658123,
349
+ "step": 210
350
+ },
351
+ {
352
+ "epoch": 0.46,
353
+ "learning_rate": 3.265574537815398e-06,
354
+ "logits/chosen": 0.7990659475326538,
355
+ "logits/rejected": 0.8434357643127441,
356
+ "logps/chosen": -101.04875946044922,
357
+ "logps/rejected": -95.13258361816406,
358
+ "loss": 0.6927,
359
+ "rewards/accuracies": 0.3187499940395355,
360
+ "rewards/chosen": -0.002797973807901144,
361
+ "rewards/margins": 0.0009291036985814571,
362
+ "rewards/rejected": -0.003727077739313245,
363
+ "step": 220
364
+ },
365
+ {
366
+ "epoch": 0.48,
367
+ "learning_rate": 3.089397338773569e-06,
368
+ "logits/chosen": 0.8693816065788269,
369
+ "logits/rejected": 0.8417167663574219,
370
+ "logps/chosen": -88.75157165527344,
371
+ "logps/rejected": -93.32563018798828,
372
+ "loss": 0.6923,
373
+ "rewards/accuracies": 0.35624998807907104,
374
+ "rewards/chosen": -0.0035379533655941486,
375
+ "rewards/margins": 0.001493574702180922,
376
+ "rewards/rejected": -0.005031527951359749,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 0.5,
381
+ "learning_rate": 2.9100607788275547e-06,
382
+ "logits/chosen": 0.7980669736862183,
383
+ "logits/rejected": 0.8121516108512878,
384
+ "logps/chosen": -115.3040771484375,
385
+ "logps/rejected": -101.95747375488281,
386
+ "loss": 0.6917,
387
+ "rewards/accuracies": 0.36250001192092896,
388
+ "rewards/chosen": -0.0025563673116266727,
389
+ "rewards/margins": 0.0030108934734016657,
390
+ "rewards/rejected": -0.005567261017858982,
391
+ "step": 240
392
+ },
393
+ {
394
+ "epoch": 0.52,
395
+ "learning_rate": 2.72852616010567e-06,
396
+ "logits/chosen": 0.7765026688575745,
397
+ "logits/rejected": 0.8406999707221985,
398
+ "logps/chosen": -116.62477111816406,
399
+ "logps/rejected": -107.66845703125,
400
+ "loss": 0.692,
401
+ "rewards/accuracies": 0.36250001192092896,
402
+ "rewards/chosen": -0.0028718970715999603,
403
+ "rewards/margins": 0.0028597547207027674,
404
+ "rewards/rejected": -0.005731652025133371,
405
+ "step": 250
406
+ },
407
+ {
408
+ "epoch": 0.54,
409
+ "learning_rate": 2.5457665670441937e-06,
410
+ "logits/chosen": 0.7957627177238464,
411
+ "logits/rejected": 0.8751128911972046,
412
+ "logps/chosen": -99.27378845214844,
413
+ "logps/rejected": -88.92988586425781,
414
+ "loss": 0.6915,
415
+ "rewards/accuracies": 0.33125001192092896,
416
+ "rewards/chosen": -0.0027769659645855427,
417
+ "rewards/margins": 0.003127423347905278,
418
+ "rewards/rejected": -0.005904389079660177,
419
+ "step": 260
420
+ },
421
+ {
422
+ "epoch": 0.57,
423
+ "learning_rate": 2.3627616503391813e-06,
424
+ "logits/chosen": 0.8911062479019165,
425
+ "logits/rejected": 0.9384480714797974,
426
+ "logps/chosen": -114.53426361083984,
427
+ "logps/rejected": -83.79386901855469,
428
+ "loss": 0.6917,
429
+ "rewards/accuracies": 0.4124999940395355,
430
+ "rewards/chosen": -0.0030242924112826586,
431
+ "rewards/margins": 0.0038346436340361834,
432
+ "rewards/rejected": -0.006858936045318842,
433
+ "step": 270
434
+ },
435
+ {
436
+ "epoch": 0.59,
437
+ "learning_rate": 2.1804923757009885e-06,
438
+ "logits/chosen": 0.8408538699150085,
439
+ "logits/rejected": 0.839997410774231,
440
+ "logps/chosen": -121.15773010253906,
441
+ "logps/rejected": -109.73968505859375,
442
+ "loss": 0.6916,
443
+ "rewards/accuracies": 0.38749998807907104,
444
+ "rewards/chosen": -0.0034516516607254744,
445
+ "rewards/margins": 0.0034923548810184,
446
+ "rewards/rejected": -0.006944006774574518,
447
+ "step": 280
448
+ },
449
+ {
450
+ "epoch": 0.61,
451
+ "learning_rate": 1.9999357655598894e-06,
452
+ "logits/chosen": 0.825127124786377,
453
+ "logits/rejected": 0.8463200330734253,
454
+ "logps/chosen": -94.99199676513672,
455
+ "logps/rejected": -92.08692932128906,
456
+ "loss": 0.6919,
457
  "rewards/accuracies": 0.2750000059604645,
458
+ "rewards/chosen": -0.005209151655435562,
459
+ "rewards/margins": 0.0012942428002133965,
460
+ "rewards/rejected": -0.006503394804894924,
461
+ "step": 290
462
+ },
463
+ {
464
+ "epoch": 0.63,
465
+ "learning_rate": 1.8220596619089576e-06,
466
+ "logits/chosen": 0.7896022796630859,
467
+ "logits/rejected": 0.8404830694198608,
468
+ "logps/chosen": -79.6166763305664,
469
+ "logps/rejected": -87.97811126708984,
470
+ "loss": 0.6917,
471
+ "rewards/accuracies": 0.3125,
472
+ "rewards/chosen": -0.002977000316604972,
473
+ "rewards/margins": 0.004023983143270016,
474
+ "rewards/rejected": -0.007000982761383057,
475
+ "step": 300
476
+ },
477
+ {
478
+ "epoch": 0.63,
479
+ "eval_logits/chosen": 0.787030041217804,
480
+ "eval_logits/rejected": 0.8057210445404053,
481
+ "eval_logps/chosen": -92.21893310546875,
482
+ "eval_logps/rejected": -94.9659194946289,
483
+ "eval_loss": 0.6917064189910889,
484
+ "eval_rewards/accuracies": 0.3100000023841858,
485
+ "eval_rewards/chosen": -0.004915657918900251,
486
+ "eval_rewards/margins": 0.0027734539471566677,
487
+ "eval_rewards/rejected": -0.0076891109347343445,
488
+ "eval_runtime": 273.1314,
489
+ "eval_samples_per_second": 7.322,
490
+ "eval_steps_per_second": 0.458,
491
+ "step": 300
492
+ },
493
+ {
494
+ "epoch": 0.65,
495
+ "learning_rate": 1.647817538357072e-06,
496
+ "logits/chosen": 0.797222912311554,
497
+ "logits/rejected": 0.8298647999763489,
498
+ "logps/chosen": -85.97596740722656,
499
+ "logps/rejected": -87.59102630615234,
500
+ "loss": 0.6915,
501
+ "rewards/accuracies": 0.33125001192092896,
502
+ "rewards/chosen": -0.005664575379341841,
503
+ "rewards/margins": 0.002779053058475256,
504
+ "rewards/rejected": -0.008443629369139671,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 0.67,
509
+ "learning_rate": 1.4781433892011132e-06,
510
+ "logits/chosen": 0.7955067753791809,
511
+ "logits/rejected": 0.8460105657577515,
512
+ "logps/chosen": -88.00541687011719,
513
+ "logps/rejected": -91.3482437133789,
514
+ "loss": 0.6916,
515
+ "rewards/accuracies": 0.3687500059604645,
516
+ "rewards/chosen": -0.005240852013230324,
517
+ "rewards/margins": 0.003471267642453313,
518
+ "rewards/rejected": -0.008712120354175568,
519
+ "step": 320
520
+ },
521
+ {
522
+ "epoch": 0.69,
523
+ "learning_rate": 1.3139467229135999e-06,
524
+ "logits/chosen": 0.7453655004501343,
525
+ "logits/rejected": 0.7835742235183716,
526
+ "logps/chosen": -101.49515533447266,
527
+ "logps/rejected": -102.06614685058594,
528
+ "loss": 0.6914,
529
+ "rewards/accuracies": 0.33125001192092896,
530
+ "rewards/chosen": -0.00533133652061224,
531
+ "rewards/margins": 0.004351903218775988,
532
+ "rewards/rejected": -0.00968323927372694,
533
+ "step": 330
534
+ },
535
+ {
536
+ "epoch": 0.71,
537
+ "learning_rate": 1.1561076868822756e-06,
538
+ "logits/chosen": 0.7904757857322693,
539
+ "logits/rejected": 0.8207653760910034,
540
+ "logps/chosen": -83.08015441894531,
541
+ "logps/rejected": -73.13031768798828,
542
+ "loss": 0.6908,
543
+ "rewards/accuracies": 0.29374998807907104,
544
+ "rewards/chosen": -0.002843918278813362,
545
+ "rewards/margins": 0.005075609777122736,
546
+ "rewards/rejected": -0.007919528521597385,
547
+ "step": 340
548
+ },
549
+ {
550
+ "epoch": 0.73,
551
+ "learning_rate": 1.0054723495346484e-06,
552
+ "logits/chosen": 0.8648473620414734,
553
+ "logits/rejected": 0.88224858045578,
554
+ "logps/chosen": -88.79496002197266,
555
+ "logps/rejected": -82.97408294677734,
556
+ "loss": 0.6911,
557
+ "rewards/accuracies": 0.36250001192092896,
558
+ "rewards/chosen": -0.0072964876890182495,
559
+ "rewards/margins": 0.003556302282959223,
560
+ "rewards/rejected": -0.01085279043763876,
561
+ "step": 350
562
+ },
563
+ {
564
+ "epoch": 0.75,
565
+ "learning_rate": 8.628481651367876e-07,
566
+ "logits/chosen": 0.7940279841423035,
567
+ "logits/rejected": 0.808300793170929,
568
+ "logps/chosen": -95.09690856933594,
569
+ "logps/rejected": -98.6402816772461,
570
+ "loss": 0.6911,
571
+ "rewards/accuracies": 0.375,
572
+ "rewards/chosen": -0.007194930221885443,
573
+ "rewards/margins": 0.004204220604151487,
574
+ "rewards/rejected": -0.01139915082603693,
575
+ "step": 360
576
+ },
577
+ {
578
+ "epoch": 0.77,
579
+ "learning_rate": 7.289996455765749e-07,
580
+ "logits/chosen": 0.8749688863754272,
581
+ "logits/rejected": 0.8549054861068726,
582
+ "logps/chosen": -107.79057312011719,
583
+ "logps/rejected": -104.4919662475586,
584
+ "loss": 0.6913,
585
+ "rewards/accuracies": 0.33125001192092896,
586
+ "rewards/chosen": -0.005484831985086203,
587
+ "rewards/margins": 0.0049090199172496796,
588
+ "rewards/rejected": -0.01039385236799717,
589
+ "step": 370
590
+ },
591
+ {
592
+ "epoch": 0.8,
593
+ "learning_rate": 6.046442623320145e-07,
594
+ "logits/chosen": 0.8186184167861938,
595
+ "logits/rejected": 0.8605905771255493,
596
+ "logps/chosen": -102.05201721191406,
597
+ "logps/rejected": -88.45350646972656,
598
+ "loss": 0.6905,
599
+ "rewards/accuracies": 0.35624998807907104,
600
+ "rewards/chosen": -0.006084176246076822,
601
+ "rewards/margins": 0.004532010294497013,
602
+ "rewards/rejected": -0.010616186074912548,
603
+ "step": 380
604
+ },
605
+ {
606
+ "epoch": 0.82,
607
+ "learning_rate": 4.904486005914027e-07,
608
+ "logits/chosen": 0.8054735064506531,
609
+ "logits/rejected": 0.8569159507751465,
610
+ "logps/chosen": -110.53129577636719,
611
+ "logps/rejected": -99.7970199584961,
612
+ "loss": 0.6905,
613
+ "rewards/accuracies": 0.35624998807907104,
614
+ "rewards/chosen": -0.0049868240021169186,
615
+ "rewards/margins": 0.005900658201426268,
616
+ "rewards/rejected": -0.01088748313486576,
617
+ "step": 390
618
+ },
619
+ {
620
+ "epoch": 0.84,
621
+ "learning_rate": 3.8702478614051353e-07,
622
+ "logits/chosen": 0.8212454915046692,
623
+ "logits/rejected": 0.8515514135360718,
624
+ "logps/chosen": -100.35879516601562,
625
+ "logps/rejected": -103.1037368774414,
626
+ "loss": 0.6905,
627
+ "rewards/accuracies": 0.3375000059604645,
628
+ "rewards/chosen": -0.006732915993779898,
629
+ "rewards/margins": 0.004675927106291056,
630
+ "rewards/rejected": -0.011408843100070953,
631
+ "step": 400
632
+ },
633
+ {
634
+ "epoch": 0.84,
635
+ "eval_logits/chosen": 0.7826990485191345,
636
+ "eval_logits/rejected": 0.8012421727180481,
637
+ "eval_logps/chosen": -92.42469787597656,
638
+ "eval_logps/rejected": -95.25089263916016,
639
+ "eval_loss": 0.6913294792175293,
640
+ "eval_rewards/accuracies": 0.328000009059906,
641
+ "eval_rewards/chosen": -0.00697335647419095,
642
+ "eval_rewards/margins": 0.0035653903614729643,
643
+ "eval_rewards/rejected": -0.010538745671510696,
644
+ "eval_runtime": 273.6535,
645
+ "eval_samples_per_second": 7.309,
646
+ "eval_steps_per_second": 0.457,
647
+ "step": 400
648
+ },
649
+ {
650
+ "epoch": 0.86,
651
+ "learning_rate": 2.9492720416985004e-07,
652
+ "logits/chosen": 0.8186850547790527,
653
+ "logits/rejected": 0.8151271939277649,
654
+ "logps/chosen": -95.82102966308594,
655
+ "logps/rejected": -80.586669921875,
656
+ "loss": 0.6916,
657
+ "rewards/accuracies": 0.3375000059604645,
658
+ "rewards/chosen": -0.00794359389692545,
659
+ "rewards/margins": 0.003056485904380679,
660
+ "rewards/rejected": -0.011000080034136772,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 0.88,
665
+ "learning_rate": 2.1464952759020857e-07,
666
+ "logits/chosen": 0.7571959495544434,
667
+ "logits/rejected": 0.8163139224052429,
668
+ "logps/chosen": -95.65677642822266,
669
+ "logps/rejected": -97.86299896240234,
670
+ "loss": 0.6915,
671
+ "rewards/accuracies": 0.3125,
672
+ "rewards/chosen": -0.00859010498970747,
673
+ "rewards/margins": 0.0028384437318891287,
674
+ "rewards/rejected": -0.011428548023104668,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 0.9,
679
+ "learning_rate": 1.4662207078575685e-07,
680
+ "logits/chosen": 0.8645572662353516,
681
+ "logits/rejected": 0.887597918510437,
682
+ "logps/chosen": -83.41182708740234,
683
+ "logps/rejected": -90.17640686035156,
684
+ "loss": 0.6913,
685
+ "rewards/accuracies": 0.39375001192092896,
686
+ "rewards/chosen": -0.006618577986955643,
687
+ "rewards/margins": 0.004637080244719982,
688
+ "rewards/rejected": -0.011255658231675625,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 0.92,
693
+ "learning_rate": 9.120948298936422e-08,
694
+ "logits/chosen": 0.7825466394424438,
695
+ "logits/rejected": 0.8328782916069031,
696
+ "logps/chosen": -109.79942321777344,
697
+ "logps/rejected": -82.07637023925781,
698
+ "loss": 0.6911,
699
+ "rewards/accuracies": 0.34375,
700
+ "rewards/chosen": -0.005336672533303499,
701
+ "rewards/margins": 0.005987245589494705,
702
+ "rewards/rejected": -0.011323917657136917,
703
+ "step": 440
704
  },
705
  {
706
  "epoch": 0.94,
707
+ "learning_rate": 4.870879364444109e-08,
708
+ "logits/chosen": 0.774901270866394,
709
+ "logits/rejected": 0.8499285578727722,
710
+ "logps/chosen": -81.24304962158203,
711
+ "logps/rejected": -71.88755798339844,
712
+ "loss": 0.6911,
713
+ "rewards/accuracies": 0.29374998807907104,
714
+ "rewards/chosen": -0.005921828560531139,
715
+ "rewards/margins": 0.0035197760444134474,
716
+ "rewards/rejected": -0.00944160483777523,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 0.96,
721
+ "learning_rate": 1.93478202307823e-08,
722
+ "logits/chosen": 0.8049672842025757,
723
+ "logits/rejected": 0.8570533990859985,
724
+ "logps/chosen": -105.7918930053711,
725
+ "logps/rejected": -87.53938293457031,
726
+ "loss": 0.6907,
727
+ "rewards/accuracies": 0.3187499940395355,
728
+ "rewards/chosen": -0.005085950251668692,
729
+ "rewards/margins": 0.0068057505413889885,
730
+ "rewards/rejected": -0.011891700327396393,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 0.98,
735
+ "learning_rate": 3.283947088983663e-09,
736
+ "logits/chosen": 0.8561135530471802,
737
+ "logits/rejected": 0.8520036935806274,
738
+ "logps/chosen": -115.6650390625,
739
+ "logps/rejected": -110.88753509521484,
740
+ "loss": 0.6903,
741
+ "rewards/accuracies": 0.39375001192092896,
742
+ "rewards/chosen": -0.006030657794326544,
743
+ "rewards/margins": 0.00688832625746727,
744
+ "rewards/rejected": -0.012918984517455101,
745
+ "step": 470
746
  },
747
  {
748
+ "epoch": 1.0,
749
+ "step": 477,
750
  "total_flos": 0.0,
751
+ "train_loss": 0.6920521804121805,
752
+ "train_runtime": 6813.2628,
753
+ "train_samples_per_second": 4.486,
754
+ "train_steps_per_second": 0.07
755
  }
756
  ],
757
  "logging_steps": 10,
758
+ "max_steps": 477,
759
  "num_input_tokens_seen": 0,
760
  "num_train_epochs": 1,
761
  "save_steps": 100,