Attila1011 commited on
Commit
cc46ea0
·
verified ·
1 Parent(s): 4ddc7a5

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -75,3 +75,4 @@ checkpoints-v4.7/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs
75
  checkpoints-v4.7/checkpoint-21504/eval_state.json filter=lfs diff=lfs merge=lfs -text
76
  checkpoints-d1.0/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
77
  checkpoints-d1.1/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
75
  checkpoints-v4.7/checkpoint-21504/eval_state.json filter=lfs diff=lfs merge=lfs -text
76
  checkpoints-d1.0/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
77
  checkpoints-d1.1/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
78
+ checkpoints-d1.2/checkpoint-20480/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-d1.2/checkpoint-20480/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53a1581a5544d90077a4d840b7e995494004238d61b9c2bb60b44022dfae684d
3
+ size 746712
checkpoints-d1.2/checkpoint-20480/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3a7540f8231769744119c8f866f5f6e59922662bb3c88a7ee54c6a2929955e6
3
+ size 57291683
checkpoints-d1.2/checkpoint-20480/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d49aca4871d305477f1a55b8fa70c80d7b64f6e0c7472d450112066b2c547273
3
+ size 24002016
checkpoints-d1.2/checkpoint-20480/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9f44f191e57d29316dab18d8a7241a4b05f16796adc80f5d38b191740863f1e
3
+ size 1569995
checkpoints-d1.2/checkpoint-20480/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3907148616e79502a6eaa138edd3484189a70786a827a780f4570ec586ce4eb
3
+ size 14645
checkpoints-d1.2/checkpoint-20480/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb57fb715619019e15e618930b0e1c9ce3934b3244dfc90e5cfc6b0cdfd9ea3d
3
+ size 1383
checkpoints-d1.2/checkpoint-20480/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:954ef71f1aa3a58a631c831e0b5b8b3292693bf5ec78c0ee303ad115a4bb3d00
3
+ size 1465
checkpoints-d1.2/checkpoint-20480/trainer_state.json ADDED
@@ -0,0 +1,1334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.8826254659857349,
6
+ "eval_steps": 1024,
7
+ "global_step": 20480,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.011032818324821687,
14
+ "grad_norm": 0.12727995216846466,
15
+ "learning_rate": 0.000498046875,
16
+ "loss": 2.2427074909210205,
17
+ "step": 256
18
+ },
19
+ {
20
+ "epoch": 0.022065636649643373,
21
+ "grad_norm": 0.37040260434150696,
22
+ "learning_rate": 0.000998046875,
23
+ "loss": 1.787421464920044,
24
+ "step": 512
25
+ },
26
+ {
27
+ "epoch": 0.03309845497446506,
28
+ "grad_norm": 0.42376086115837097,
29
+ "learning_rate": 0.000999688448778502,
30
+ "loss": 1.6120189428329468,
31
+ "step": 768
32
+ },
33
+ {
34
+ "epoch": 0.04413127329928675,
35
+ "grad_norm": 0.728442370891571,
36
+ "learning_rate": 0.0009987492950653055,
37
+ "loss": 1.5639891624450684,
38
+ "step": 1024
39
+ },
40
+ {
41
+ "epoch": 0.04413127329928675,
42
+ "eval_bleu": 0.9365129221049671,
43
+ "eval_cos_loss": 0.4736882319836728,
44
+ "eval_dec_loss": 0.11874023166252796,
45
+ "eval_loss": 1.5516616545760555,
46
+ "eval_mse2_loss": 0.1660867908647828,
47
+ "eval_mse3_loss": 0.04509278692241544,
48
+ "eval_mse_loss": 1.340482075839663,
49
+ "eval_rec_loss": 0.047009017791098624,
50
+ "eval_var_loss": 0.01723895594080501,
51
+ "flow/cos_sim": 0.5263117668089836,
52
+ "flow/improvement_ratio": 0.8939692384398568,
53
+ "flow/mag_ratio_mean": 0.5442470624121521,
54
+ "flow/mag_ratio_std": 0.24557339487426572,
55
+ "step": 1024
56
+ },
57
+ {
58
+ "epoch": 0.04413127329928675,
59
+ "eval_bleu": 0.9365129221049671,
60
+ "eval_cos_loss": 0.4736882319836728,
61
+ "eval_dec_loss": 0.11874023166252796,
62
+ "eval_loss": 1.5516616545760555,
63
+ "eval_mse2_loss": 0.1660867908647828,
64
+ "eval_mse3_loss": 0.04509278692241544,
65
+ "eval_mse_loss": 1.340482075839663,
66
+ "eval_rec_loss": 0.047009017791098624,
67
+ "eval_runtime": 152.4374,
68
+ "eval_samples_per_second": 196.802,
69
+ "eval_steps_per_second": 3.077,
70
+ "eval_var_loss": 0.01723895594080501,
71
+ "flow/cos_sim": 0.5263117668089836,
72
+ "flow/improvement_ratio": 0.8939692384398568,
73
+ "flow/mag_ratio_mean": 0.5442470624121521,
74
+ "flow/mag_ratio_std": 0.24557339487426572,
75
+ "step": 1024
76
+ },
77
+ {
78
+ "epoch": 0.05516409162410843,
79
+ "grad_norm": 0.5939557552337646,
80
+ "learning_rate": 0.0009971837136430763,
81
+ "loss": 1.5434116125106812,
82
+ "step": 1280
83
+ },
84
+ {
85
+ "epoch": 0.06619690994893011,
86
+ "grad_norm": 0.5294345021247864,
87
+ "learning_rate": 0.0009949936708776692,
88
+ "loss": 1.522328495979309,
89
+ "step": 1536
90
+ },
91
+ {
92
+ "epoch": 0.07722972827375181,
93
+ "grad_norm": 1.3655198812484741,
94
+ "learning_rate": 0.0009921819174566252,
95
+ "loss": 1.514147162437439,
96
+ "step": 1792
97
+ },
98
+ {
99
+ "epoch": 0.0882625465985735,
100
+ "grad_norm": 0.7738495469093323,
101
+ "learning_rate": 0.000988751984934317,
102
+ "loss": 1.5043810606002808,
103
+ "step": 2048
104
+ },
105
+ {
106
+ "epoch": 0.0882625465985735,
107
+ "eval_bleu": 0.9391092887836987,
108
+ "eval_cos_loss": 0.45917267693893743,
109
+ "eval_dec_loss": 0.1044372236793007,
110
+ "eval_loss": 1.5007227758354724,
111
+ "eval_mse2_loss": 0.15699366443574048,
112
+ "eval_mse3_loss": 0.0420123795543843,
113
+ "eval_mse_loss": 1.3017167356222676,
114
+ "eval_rec_loss": 0.047009017791098624,
115
+ "eval_var_loss": 0.01723895594080501,
116
+ "flow/cos_sim": 0.5408273230610626,
117
+ "flow/improvement_ratio": 0.8913466692733358,
118
+ "flow/mag_ratio_mean": 0.5480725960945015,
119
+ "flow/mag_ratio_std": 0.25143888014466015,
120
+ "step": 2048
121
+ },
122
+ {
123
+ "epoch": 0.0882625465985735,
124
+ "eval_bleu": 0.9391092887836987,
125
+ "eval_cos_loss": 0.45917267693893743,
126
+ "eval_dec_loss": 0.1044372236793007,
127
+ "eval_loss": 1.5007227758354724,
128
+ "eval_mse2_loss": 0.15699366443574048,
129
+ "eval_mse3_loss": 0.0420123795543843,
130
+ "eval_mse_loss": 1.3017167356222676,
131
+ "eval_rec_loss": 0.047009017791098624,
132
+ "eval_runtime": 149.7255,
133
+ "eval_samples_per_second": 200.367,
134
+ "eval_steps_per_second": 3.132,
135
+ "eval_var_loss": 0.01723895594080501,
136
+ "flow/cos_sim": 0.5408273230610626,
137
+ "flow/improvement_ratio": 0.8913466692733358,
138
+ "flow/mag_ratio_mean": 0.5480725960945015,
139
+ "flow/mag_ratio_std": 0.25143888014466015,
140
+ "step": 2048
141
+ },
142
+ {
143
+ "epoch": 0.09929536492339518,
144
+ "grad_norm": 0.723221480846405,
145
+ "learning_rate": 0.0009847081812963268,
146
+ "loss": 1.496282696723938,
147
+ "step": 2304
148
+ },
149
+ {
150
+ "epoch": 0.11032818324821686,
151
+ "grad_norm": 0.8804053068161011,
152
+ "learning_rate": 0.0009800555855486275,
153
+ "loss": 1.4938578605651855,
154
+ "step": 2560
155
+ },
156
+ {
157
+ "epoch": 0.12136100157303854,
158
+ "grad_norm": 0.9812144637107849,
159
+ "learning_rate": 0.0009748000413383664,
160
+ "loss": 1.4887880086898804,
161
+ "step": 2816
162
+ },
163
+ {
164
+ "epoch": 0.13239381989786023,
165
+ "grad_norm": 1.3136183023452759,
166
+ "learning_rate": 0.0009689481496142604,
167
+ "loss": 1.4831873178482056,
168
+ "step": 3072
169
+ },
170
+ {
171
+ "epoch": 0.13239381989786023,
172
+ "eval_bleu": 0.9371003601305814,
173
+ "eval_cos_loss": 0.4514941948690394,
174
+ "eval_dec_loss": 0.11555046006354061,
175
+ "eval_loss": 1.478005530483433,
176
+ "eval_mse2_loss": 0.15441496636885316,
177
+ "eval_mse3_loss": 0.04251180985557245,
178
+ "eval_mse_loss": 1.2810787536950508,
179
+ "eval_rec_loss": 0.047009017791098624,
180
+ "eval_var_loss": 0.01723895594080501,
181
+ "flow/cos_sim": 0.5485058038600726,
182
+ "flow/improvement_ratio": 0.8955106252292072,
183
+ "flow/mag_ratio_mean": 0.5588331800788197,
184
+ "flow/mag_ratio_std": 0.2562640742071148,
185
+ "step": 3072
186
+ },
187
+ {
188
+ "epoch": 0.13239381989786023,
189
+ "eval_bleu": 0.9371003601305814,
190
+ "eval_cos_loss": 0.4514941948690394,
191
+ "eval_dec_loss": 0.11555046006354061,
192
+ "eval_loss": 1.478005530483433,
193
+ "eval_mse2_loss": 0.15441496636885316,
194
+ "eval_mse3_loss": 0.04251180985557245,
195
+ "eval_mse_loss": 1.2810787536950508,
196
+ "eval_rec_loss": 0.047009017791098624,
197
+ "eval_runtime": 149.3327,
198
+ "eval_samples_per_second": 200.894,
199
+ "eval_steps_per_second": 3.141,
200
+ "eval_var_loss": 0.01723895594080501,
201
+ "flow/cos_sim": 0.5485058038600726,
202
+ "flow/improvement_ratio": 0.8955106252292072,
203
+ "flow/mag_ratio_mean": 0.5588331800788197,
204
+ "flow/mag_ratio_std": 0.2562640742071148,
205
+ "step": 3072
206
+ },
207
+ {
208
+ "epoch": 0.14342663822268192,
209
+ "grad_norm": 0.7567078471183777,
210
+ "learning_rate": 0.0009625072603358231,
211
+ "loss": 1.4780577421188354,
212
+ "step": 3328
213
+ },
214
+ {
215
+ "epoch": 0.15445945654750362,
216
+ "grad_norm": 0.9316087365150452,
217
+ "learning_rate": 0.0009554854632418371,
218
+ "loss": 1.4720772504806519,
219
+ "step": 3584
220
+ },
221
+ {
222
+ "epoch": 0.1654922748723253,
223
+ "grad_norm": 0.8111740946769714,
224
+ "learning_rate": 0.000947891577689663,
225
+ "loss": 1.469363808631897,
226
+ "step": 3840
227
+ },
228
+ {
229
+ "epoch": 0.176525093197147,
230
+ "grad_norm": 1.054283618927002,
231
+ "learning_rate": 0.0009397351415781539,
232
+ "loss": 1.46699059009552,
233
+ "step": 4096
234
+ },
235
+ {
236
+ "epoch": 0.176525093197147,
237
+ "eval_bleu": 0.9394680773832225,
238
+ "eval_cos_loss": 0.44803570569959533,
239
+ "eval_dec_loss": 0.10979667206459653,
240
+ "eval_loss": 1.4644885746909102,
241
+ "eval_mse2_loss": 0.15139158122511562,
242
+ "eval_mse3_loss": 0.04131953247876437,
243
+ "eval_mse_loss": 1.271777458790777,
244
+ "eval_rec_loss": 0.047009017791098624,
245
+ "eval_var_loss": 0.01723895594080501,
246
+ "flow/cos_sim": 0.5519642917586288,
247
+ "flow/improvement_ratio": 0.8933102408451821,
248
+ "flow/mag_ratio_mean": 0.5608082282771942,
249
+ "flow/mag_ratio_std": 0.25901126921939444,
250
+ "step": 4096
251
+ },
252
+ {
253
+ "epoch": 0.176525093197147,
254
+ "eval_bleu": 0.9394680773832225,
255
+ "eval_cos_loss": 0.44803570569959533,
256
+ "eval_dec_loss": 0.10979667206459653,
257
+ "eval_loss": 1.4644885746909102,
258
+ "eval_mse2_loss": 0.15139158122511562,
259
+ "eval_mse3_loss": 0.04131953247876437,
260
+ "eval_mse_loss": 1.271777458790777,
261
+ "eval_rec_loss": 0.047009017791098624,
262
+ "eval_runtime": 150.2566,
263
+ "eval_samples_per_second": 199.658,
264
+ "eval_steps_per_second": 3.121,
265
+ "eval_var_loss": 0.01723895594080501,
266
+ "flow/cos_sim": 0.5519642917586288,
267
+ "flow/improvement_ratio": 0.8933102408451821,
268
+ "flow/mag_ratio_mean": 0.5608082282771942,
269
+ "flow/mag_ratio_std": 0.25901126921939444,
270
+ "step": 4096
271
+ },
272
+ {
273
+ "epoch": 0.18755791152196866,
274
+ "grad_norm": 1.008694052696228,
275
+ "learning_rate": 0.000931026399368079,
276
+ "loss": 1.4621409177780151,
277
+ "step": 4352
278
+ },
279
+ {
280
+ "epoch": 0.19859072984679035,
281
+ "grad_norm": 1.2265310287475586,
282
+ "learning_rate": 0.0009217762892151117,
283
+ "loss": 1.4644461870193481,
284
+ "step": 4608
285
+ },
286
+ {
287
+ "epoch": 0.20962354817161205,
288
+ "grad_norm": 0.8491294384002686,
289
+ "learning_rate": 0.0009119964292315354,
290
+ "loss": 1.4609150886535645,
291
+ "step": 4864
292
+ },
293
+ {
294
+ "epoch": 0.22065636649643372,
295
+ "grad_norm": 0.8653346300125122,
296
+ "learning_rate": 0.0009016991028939279,
297
+ "loss": 1.4556578397750854,
298
+ "step": 5120
299
+ },
300
+ {
301
+ "epoch": 0.22065636649643372,
302
+ "eval_bleu": 0.9395664278045301,
303
+ "eval_cos_loss": 0.4446420454775601,
304
+ "eval_dec_loss": 0.10941354105713716,
305
+ "eval_loss": 1.4521360926028253,
306
+ "eval_mse2_loss": 0.14856901723565832,
307
+ "eval_mse3_loss": 0.04055595990103572,
308
+ "eval_mse_loss": 1.26301111900476,
309
+ "eval_rec_loss": 0.047009017791098624,
310
+ "eval_var_loss": 0.01723895594080501,
311
+ "flow/cos_sim": 0.555357955666239,
312
+ "flow/improvement_ratio": 0.896109628270684,
313
+ "flow/mag_ratio_mean": 0.5672629550575956,
314
+ "flow/mag_ratio_std": 0.2583374333089349,
315
+ "step": 5120
316
+ },
317
+ {
318
+ "epoch": 0.22065636649643372,
319
+ "eval_bleu": 0.9395664278045301,
320
+ "eval_cos_loss": 0.4446420454775601,
321
+ "eval_dec_loss": 0.10941354105713716,
322
+ "eval_loss": 1.4521360926028253,
323
+ "eval_mse2_loss": 0.14856901723565832,
324
+ "eval_mse3_loss": 0.04055595990103572,
325
+ "eval_mse_loss": 1.26301111900476,
326
+ "eval_rec_loss": 0.047009017791098624,
327
+ "eval_runtime": 150.4389,
328
+ "eval_samples_per_second": 199.416,
329
+ "eval_steps_per_second": 3.118,
330
+ "eval_var_loss": 0.01723895594080501,
331
+ "flow/cos_sim": 0.555357955666239,
332
+ "flow/improvement_ratio": 0.896109628270684,
333
+ "flow/mag_ratio_mean": 0.5672629550575956,
334
+ "flow/mag_ratio_std": 0.2583374333089349,
335
+ "step": 5120
336
+ },
337
+ {
338
+ "epoch": 0.23168918482125542,
339
+ "grad_norm": 0.7437342405319214,
340
+ "learning_rate": 0.0008908972436151494,
341
+ "loss": 1.4530967473983765,
342
+ "step": 5376
343
+ },
344
+ {
345
+ "epoch": 0.2427220031460771,
346
+ "grad_norm": 1.2461498975753784,
347
+ "learning_rate": 0.0008796044185000127,
348
+ "loss": 1.4508432149887085,
349
+ "step": 5632
350
+ },
351
+ {
352
+ "epoch": 0.2537548214708988,
353
+ "grad_norm": 0.9174250364303589,
354
+ "learning_rate": 0.0008678348113050368,
355
+ "loss": 1.445737600326538,
356
+ "step": 5888
357
+ },
358
+ {
359
+ "epoch": 0.26478763979572045,
360
+ "grad_norm": 0.7107803821563721,
361
+ "learning_rate": 0.0008556032046236897,
362
+ "loss": 1.448463797569275,
363
+ "step": 6144
364
+ },
365
+ {
366
+ "epoch": 0.26478763979572045,
367
+ "eval_bleu": 0.9383079878806191,
368
+ "eval_cos_loss": 0.44315467447614365,
369
+ "eval_dec_loss": 0.11260034935846766,
370
+ "eval_loss": 1.4483204581844273,
371
+ "eval_mse2_loss": 0.14934769469791892,
372
+ "eval_mse3_loss": 0.04125582999281728,
373
+ "eval_mse_loss": 1.257716933293129,
374
+ "eval_rec_loss": 0.047009017791098624,
375
+ "eval_var_loss": 0.01723895594080501,
376
+ "flow/cos_sim": 0.5568453247613235,
377
+ "flow/improvement_ratio": 0.8934365713011736,
378
+ "flow/mag_ratio_mean": 0.5627565476685953,
379
+ "flow/mag_ratio_std": 0.26418959474894027,
380
+ "step": 6144
381
+ },
382
+ {
383
+ "epoch": 0.26478763979572045,
384
+ "eval_bleu": 0.9383079878806191,
385
+ "eval_cos_loss": 0.44315467447614365,
386
+ "eval_dec_loss": 0.11260034935846766,
387
+ "eval_loss": 1.4483204581844273,
388
+ "eval_mse2_loss": 0.14934769469791892,
389
+ "eval_mse3_loss": 0.04125582999281728,
390
+ "eval_mse_loss": 1.257716933293129,
391
+ "eval_rec_loss": 0.047009017791098624,
392
+ "eval_runtime": 151.7824,
393
+ "eval_samples_per_second": 197.651,
394
+ "eval_steps_per_second": 3.09,
395
+ "eval_var_loss": 0.01723895594080501,
396
+ "flow/cos_sim": 0.5568453247613235,
397
+ "flow/improvement_ratio": 0.8934365713011736,
398
+ "flow/mag_ratio_mean": 0.5627565476685953,
399
+ "flow/mag_ratio_std": 0.26418959474894027,
400
+ "step": 6144
401
+ },
402
+ {
403
+ "epoch": 0.2758204581205422,
404
+ "grad_norm": 0.9640243649482727,
405
+ "learning_rate": 0.000842924961319492,
406
+ "loss": 1.4444574117660522,
407
+ "step": 6400
408
+ },
409
+ {
410
+ "epoch": 0.28685327644536385,
411
+ "grad_norm": 0.897240161895752,
412
+ "learning_rate": 0.0008298160052303045,
413
+ "loss": 1.4424360990524292,
414
+ "step": 6656
415
+ },
416
+ {
417
+ "epoch": 0.2978860947701855,
418
+ "grad_norm": 0.8083540201187134,
419
+ "learning_rate": 0.0008162928011680314,
420
+ "loss": 1.4411871433258057,
421
+ "step": 6912
422
+ },
423
+ {
424
+ "epoch": 0.30891891309500724,
425
+ "grad_norm": 1.0060639381408691,
426
+ "learning_rate": 0.000802372334238864,
427
+ "loss": 1.439369559288025,
428
+ "step": 7168
429
+ },
430
+ {
431
+ "epoch": 0.30891891309500724,
432
+ "eval_bleu": 0.9390257063353732,
433
+ "eval_cos_loss": 0.4395336633933378,
434
+ "eval_dec_loss": 0.11084323670905727,
435
+ "eval_loss": 1.4360539821673557,
436
+ "eval_mse2_loss": 0.14675505667416525,
437
+ "eval_mse3_loss": 0.04034361798070006,
438
+ "eval_mse_loss": 1.2489553087555778,
439
+ "eval_rec_loss": 0.047009017791098624,
440
+ "eval_var_loss": 0.01723895594080501,
441
+ "flow/cos_sim": 0.5604663359712182,
442
+ "flow/improvement_ratio": 0.8947566227872235,
443
+ "flow/mag_ratio_mean": 0.5714088110273072,
444
+ "flow/mag_ratio_std": 0.2653434300092238,
445
+ "step": 7168
446
+ },
447
+ {
448
+ "epoch": 0.30891891309500724,
449
+ "eval_bleu": 0.9390257063353732,
450
+ "eval_cos_loss": 0.4395336633933378,
451
+ "eval_dec_loss": 0.11084323670905727,
452
+ "eval_loss": 1.4360539821673557,
453
+ "eval_mse2_loss": 0.14675505667416525,
454
+ "eval_mse3_loss": 0.04034361798070006,
455
+ "eval_mse_loss": 1.2489553087555778,
456
+ "eval_rec_loss": 0.047009017791098624,
457
+ "eval_runtime": 150.3295,
458
+ "eval_samples_per_second": 199.562,
459
+ "eval_steps_per_second": 3.12,
460
+ "eval_var_loss": 0.01723895594080501,
461
+ "flow/cos_sim": 0.5604663359712182,
462
+ "flow/improvement_ratio": 0.8947566227872235,
463
+ "flow/mag_ratio_mean": 0.5714088110273072,
464
+ "flow/mag_ratio_std": 0.2653434300092238,
465
+ "step": 7168
466
+ },
467
+ {
468
+ "epoch": 0.3199517314198289,
469
+ "grad_norm": 1.4121514558792114,
470
+ "learning_rate": 0.0007880720885100349,
471
+ "loss": 1.4375568628311157,
472
+ "step": 7424
473
+ },
474
+ {
475
+ "epoch": 0.3309845497446506,
476
+ "grad_norm": 0.9962936639785767,
477
+ "learning_rate": 0.0007734100250498788,
478
+ "loss": 1.435603141784668,
479
+ "step": 7680
480
+ },
481
+ {
482
+ "epoch": 0.3420173680694723,
483
+ "grad_norm": 1.7380033731460571,
484
+ "learning_rate": 0.000758404559368781,
485
+ "loss": 1.436031460762024,
486
+ "step": 7936
487
+ },
488
+ {
489
+ "epoch": 0.353050186394294,
490
+ "grad_norm": 0.9169597625732422,
491
+ "learning_rate": 0.0007430745382893488,
492
+ "loss": 1.430087685585022,
493
+ "step": 8192
494
+ },
495
+ {
496
+ "epoch": 0.353050186394294,
497
+ "eval_bleu": 0.9373010926402804,
498
+ "eval_cos_loss": 0.4382981645908437,
499
+ "eval_dec_loss": 0.11403963093492966,
500
+ "eval_loss": 1.4327775341615494,
501
+ "eval_mse2_loss": 0.14718564421828115,
502
+ "eval_mse3_loss": 0.040787868026985544,
503
+ "eval_mse_loss": 1.244804022917107,
504
+ "eval_rec_loss": 0.047009017791098624,
505
+ "eval_var_loss": 0.01723895594080501,
506
+ "flow/cos_sim": 0.5617018342018127,
507
+ "flow/improvement_ratio": 0.8961344680298112,
508
+ "flow/mag_ratio_mean": 0.5686201650196555,
509
+ "flow/mag_ratio_std": 0.26075691680537105,
510
+ "step": 8192
511
+ },
512
+ {
513
+ "epoch": 0.353050186394294,
514
+ "eval_bleu": 0.9373010926402804,
515
+ "eval_cos_loss": 0.4382981645908437,
516
+ "eval_dec_loss": 0.11403963093492966,
517
+ "eval_loss": 1.4327775341615494,
518
+ "eval_mse2_loss": 0.14718564421828115,
519
+ "eval_mse3_loss": 0.040787868026985544,
520
+ "eval_mse_loss": 1.244804022917107,
521
+ "eval_rec_loss": 0.047009017791098624,
522
+ "eval_runtime": 151.6067,
523
+ "eval_samples_per_second": 197.88,
524
+ "eval_steps_per_second": 3.094,
525
+ "eval_var_loss": 0.01723895594080501,
526
+ "flow/cos_sim": 0.5617018342018127,
527
+ "flow/improvement_ratio": 0.8961344680298112,
528
+ "flow/mag_ratio_mean": 0.5686201650196555,
529
+ "flow/mag_ratio_std": 0.26075691680537105,
530
+ "step": 8192
531
+ },
532
+ {
533
+ "epoch": 0.36408300471911564,
534
+ "grad_norm": 1.2372357845306396,
535
+ "learning_rate": 0.0007274392162748551,
536
+ "loss": 1.433260440826416,
537
+ "step": 8448
538
+ },
539
+ {
540
+ "epoch": 0.3751158230439373,
541
+ "grad_norm": 0.7132707834243774,
542
+ "learning_rate": 0.000711518231245687,
543
+ "loss": 1.4292922019958496,
544
+ "step": 8704
545
+ },
546
+ {
547
+ "epoch": 0.38614864136875904,
548
+ "grad_norm": 0.933404266834259,
549
+ "learning_rate": 0.0006953315799141723,
550
+ "loss": 1.428688645362854,
551
+ "step": 8960
552
+ },
553
+ {
554
+ "epoch": 0.3971814596935807,
555
+ "grad_norm": 1.2554126977920532,
556
+ "learning_rate": 0.0006788995926687669,
557
+ "loss": 1.425850510597229,
558
+ "step": 9216
559
+ },
560
+ {
561
+ "epoch": 0.3971814596935807,
562
+ "eval_bleu": 0.9372418767062352,
563
+ "eval_cos_loss": 0.43612730547563355,
564
+ "eval_dec_loss": 0.11531878670633856,
565
+ "eval_loss": 1.4266213120173799,
566
+ "eval_mse2_loss": 0.1465762988813142,
567
+ "eval_mse3_loss": 0.040784004980376536,
568
+ "eval_mse_loss": 1.2392610072581245,
569
+ "eval_rec_loss": 0.047009017791098624,
570
+ "eval_var_loss": 0.01723895594080501,
571
+ "flow/cos_sim": 0.5638726928086677,
572
+ "flow/improvement_ratio": 0.8970644686267828,
573
+ "flow/mag_ratio_mean": 0.5689206580871712,
574
+ "flow/mag_ratio_std": 0.25856476135726675,
575
+ "step": 9216
576
+ },
577
+ {
578
+ "epoch": 0.3971814596935807,
579
+ "eval_bleu": 0.9372418767062352,
580
+ "eval_cos_loss": 0.43612730547563355,
581
+ "eval_dec_loss": 0.11531878670633856,
582
+ "eval_loss": 1.4266213120173799,
583
+ "eval_mse2_loss": 0.1465762988813142,
584
+ "eval_mse3_loss": 0.040784004980376536,
585
+ "eval_mse_loss": 1.2392610072581245,
586
+ "eval_rec_loss": 0.047009017791098624,
587
+ "eval_runtime": 150.6594,
588
+ "eval_samples_per_second": 199.125,
589
+ "eval_steps_per_second": 3.113,
590
+ "eval_var_loss": 0.01723895594080501,
591
+ "flow/cos_sim": 0.5638726928086677,
592
+ "flow/improvement_ratio": 0.8970644686267828,
593
+ "flow/mag_ratio_mean": 0.5689206580871712,
594
+ "flow/mag_ratio_std": 0.25856476135726675,
595
+ "step": 9216
596
+ },
597
+ {
598
+ "epoch": 0.4082142780184024,
599
+ "grad_norm": 1.1525744199752808,
600
+ "learning_rate": 0.0006622429080391422,
601
+ "loss": 1.429610252380371,
602
+ "step": 9472
603
+ },
604
+ {
605
+ "epoch": 0.4192470963432241,
606
+ "grad_norm": 0.7822180986404419,
607
+ "learning_rate": 0.0006453824467742515,
608
+ "loss": 1.424994945526123,
609
+ "step": 9728
610
+ },
611
+ {
612
+ "epoch": 0.43027991466804577,
613
+ "grad_norm": 0.5769438743591309,
614
+ "learning_rate": 0.0006283393855659275,
615
+ "loss": 1.4268593788146973,
616
+ "step": 9984
617
+ },
618
+ {
619
+ "epoch": 0.44131273299286744,
620
+ "grad_norm": 1.1103806495666504,
621
+ "learning_rate": 0.0006111351304510173,
622
+ "loss": 1.4214195013046265,
623
+ "step": 10240
624
+ },
625
+ {
626
+ "epoch": 0.44131273299286744,
627
+ "eval_bleu": 0.9379655406873967,
628
+ "eval_cos_loss": 0.4348486333386476,
629
+ "eval_dec_loss": 0.11333615507986118,
630
+ "eval_loss": 1.423404996329025,
631
+ "eval_mse2_loss": 0.14630243560271478,
632
+ "eval_mse3_loss": 0.0406117777207068,
633
+ "eval_mse_loss": 1.2364907806107739,
634
+ "eval_rec_loss": 0.047009017791098624,
635
+ "eval_var_loss": 0.01723895594080501,
636
+ "flow/cos_sim": 0.5651513632934994,
637
+ "flow/improvement_ratio": 0.8989848929173403,
638
+ "flow/mag_ratio_mean": 0.5767871759085259,
639
+ "flow/mag_ratio_std": 0.2627801252390022,
640
+ "step": 10240
641
+ },
642
+ {
643
+ "epoch": 0.44131273299286744,
644
+ "eval_bleu": 0.9379655406873967,
645
+ "eval_cos_loss": 0.4348486333386476,
646
+ "eval_dec_loss": 0.11333615507986118,
647
+ "eval_loss": 1.423404996329025,
648
+ "eval_mse2_loss": 0.14630243560271478,
649
+ "eval_mse3_loss": 0.0406117777207068,
650
+ "eval_mse_loss": 1.2364907806107739,
651
+ "eval_rec_loss": 0.047009017791098624,
652
+ "eval_runtime": 150.5926,
653
+ "eval_samples_per_second": 199.213,
654
+ "eval_steps_per_second": 3.114,
655
+ "eval_var_loss": 0.01723895594080501,
656
+ "flow/cos_sim": 0.5651513632934994,
657
+ "flow/improvement_ratio": 0.8989848929173403,
658
+ "flow/mag_ratio_mean": 0.5767871759085259,
659
+ "flow/mag_ratio_std": 0.2627801252390022,
660
+ "step": 10240
661
+ },
662
+ {
663
+ "epoch": 0.45234555131768917,
664
+ "grad_norm": 0.9379155039787292,
665
+ "learning_rate": 0.0005937912899254605,
666
+ "loss": 1.4213385581970215,
667
+ "step": 10496
668
+ },
669
+ {
670
+ "epoch": 0.46337836964251083,
671
+ "grad_norm": 0.7825599312782288,
672
+ "learning_rate": 0.0005763296478040787,
673
+ "loss": 1.4202309846878052,
674
+ "step": 10752
675
+ },
676
+ {
677
+ "epoch": 0.4744111879673325,
678
+ "grad_norm": 0.9089685082435608,
679
+ "learning_rate": 0.0005587721358601663,
680
+ "loss": 1.4216351509094238,
681
+ "step": 11008
682
+ },
683
+ {
684
+ "epoch": 0.4854440062921542,
685
+ "grad_norm": 0.8983961939811707,
686
+ "learning_rate": 0.0005411408062792448,
687
+ "loss": 1.4181705713272095,
688
+ "step": 11264
689
+ },
690
+ {
691
+ "epoch": 0.4854440062921542,
692
+ "eval_bleu": 0.936860898611364,
693
+ "eval_cos_loss": 0.43602795717812803,
694
+ "eval_dec_loss": 0.11359805543261614,
695
+ "eval_loss": 1.4229239845580892,
696
+ "eval_mse2_loss": 0.1442256863596343,
697
+ "eval_mse3_loss": 0.04007743425897634,
698
+ "eval_mse_loss": 1.2386208620152748,
699
+ "eval_rec_loss": 0.047009017791098624,
700
+ "eval_var_loss": 0.01723895594080501,
701
+ "flow/cos_sim": 0.5639720439656711,
702
+ "flow/improvement_ratio": 0.8950573275846713,
703
+ "flow/mag_ratio_mean": 0.5711015071441878,
704
+ "flow/mag_ratio_std": 0.2601570950896501,
705
+ "step": 11264
706
+ },
707
+ {
708
+ "epoch": 0.4854440062921542,
709
+ "eval_bleu": 0.936860898611364,
710
+ "eval_cos_loss": 0.43602795717812803,
711
+ "eval_dec_loss": 0.11359805543261614,
712
+ "eval_loss": 1.4229239845580892,
713
+ "eval_mse2_loss": 0.1442256863596343,
714
+ "eval_mse3_loss": 0.04007743425897634,
715
+ "eval_mse_loss": 1.2386208620152748,
716
+ "eval_rec_loss": 0.047009017791098624,
717
+ "eval_runtime": 149.9228,
718
+ "eval_samples_per_second": 200.103,
719
+ "eval_steps_per_second": 3.128,
720
+ "eval_var_loss": 0.01723895594080501,
721
+ "flow/cos_sim": 0.5639720439656711,
722
+ "flow/improvement_ratio": 0.8950573275846713,
723
+ "flow/mag_ratio_mean": 0.5711015071441878,
724
+ "flow/mag_ratio_std": 0.2601570950896501,
725
+ "step": 11264
726
+ },
727
+ {
728
+ "epoch": 0.4964768246169759,
729
+ "grad_norm": 0.8639885783195496,
730
+ "learning_rate": 0.0005234578039615789,
731
+ "loss": 1.4164997339248657,
732
+ "step": 11520
733
+ },
734
+ {
735
+ "epoch": 0.5075096429417976,
736
+ "grad_norm": 1.0776760578155518,
737
+ "learning_rate": 0.0005057453387082458,
738
+ "loss": 1.41534423828125,
739
+ "step": 11776
740
+ },
741
+ {
742
+ "epoch": 0.5185424612666193,
743
+ "grad_norm": 1.164801001548767,
744
+ "learning_rate": 0.0004880256573256866,
745
+ "loss": 1.417317509651184,
746
+ "step": 12032
747
+ },
748
+ {
749
+ "epoch": 0.5295752795914409,
750
+ "grad_norm": 0.853115439414978,
751
+ "learning_rate": 0.0004703210156837805,
752
+ "loss": 1.4166315793991089,
753
+ "step": 12288
754
+ },
755
+ {
756
+ "epoch": 0.5295752795914409,
757
+ "eval_bleu": 0.9380693394475813,
758
+ "eval_cos_loss": 0.43475201853048573,
759
+ "eval_dec_loss": 0.11086099378979092,
760
+ "eval_loss": 1.4183173507515556,
761
+ "eval_mse2_loss": 0.14407628963687527,
762
+ "eval_mse3_loss": 0.03990168983081002,
763
+ "eval_mse_loss": 1.2343393711647246,
764
+ "eval_rec_loss": 0.047009017791098624,
765
+ "eval_var_loss": 0.01723895594080501,
766
+ "flow/cos_sim": 0.5652479861082553,
767
+ "flow/improvement_ratio": 0.896212916130196,
768
+ "flow/mag_ratio_mean": 0.5760958854323511,
769
+ "flow/mag_ratio_std": 0.2636355218539106,
770
+ "step": 12288
771
+ },
772
+ {
773
+ "epoch": 0.5295752795914409,
774
+ "eval_bleu": 0.9380693394475813,
775
+ "eval_cos_loss": 0.43475201853048573,
776
+ "eval_dec_loss": 0.11086099378979092,
777
+ "eval_loss": 1.4183173507515556,
778
+ "eval_mse2_loss": 0.14407628963687527,
779
+ "eval_mse3_loss": 0.03990168983081002,
780
+ "eval_mse_loss": 1.2343393711647246,
781
+ "eval_rec_loss": 0.047009017791098624,
782
+ "eval_runtime": 151.7991,
783
+ "eval_samples_per_second": 197.63,
784
+ "eval_steps_per_second": 3.09,
785
+ "eval_var_loss": 0.01723895594080501,
786
+ "flow/cos_sim": 0.5652479861082553,
787
+ "flow/improvement_ratio": 0.896212916130196,
788
+ "flow/mag_ratio_mean": 0.5760958854323511,
789
+ "flow/mag_ratio_std": 0.2636355218539106,
790
+ "step": 12288
791
+ },
792
+ {
793
+ "epoch": 0.5406080979162626,
794
+ "grad_norm": 0.5429331064224243,
795
+ "learning_rate": 0.0004526536507625343,
796
+ "loss": 1.4145379066467285,
797
+ "step": 12544
798
+ },
799
+ {
800
+ "epoch": 0.5516409162410844,
801
+ "grad_norm": 1.1079273223876953,
802
+ "learning_rate": 0.00043504575272249973,
803
+ "loss": 1.4163099527359009,
804
+ "step": 12800
805
+ },
806
+ {
807
+ "epoch": 0.562673734565906,
808
+ "grad_norm": 1.1678777933120728,
809
+ "learning_rate": 0.0004175194370339921,
810
+ "loss": 1.4152926206588745,
811
+ "step": 13056
812
+ },
813
+ {
814
+ "epoch": 0.5737065528907277,
815
+ "grad_norm": 0.950139045715332,
816
+ "learning_rate": 0.0004000967167001243,
817
+ "loss": 1.4137598276138306,
818
+ "step": 13312
819
+ },
820
+ {
821
+ "epoch": 0.5737065528907277,
822
+ "eval_bleu": 0.9388806472418573,
823
+ "eval_cos_loss": 0.4328056685070493,
824
+ "eval_dec_loss": 0.10686975376589149,
825
+ "eval_loss": 1.4127923059565173,
826
+ "eval_mse2_loss": 0.14246540646880929,
827
+ "eval_mse3_loss": 0.03924089211867308,
828
+ "eval_mse_loss": 1.231086006296723,
829
+ "eval_rec_loss": 0.047009017791098624,
830
+ "eval_var_loss": 0.01723895594080501,
831
+ "flow/cos_sim": 0.5671943300314295,
832
+ "flow/improvement_ratio": 0.8961777138049161,
833
+ "flow/mag_ratio_mean": 0.5744174228294063,
834
+ "flow/mag_ratio_std": 0.2622520730121812,
835
+ "step": 13312
836
+ },
837
+ {
838
+ "epoch": 0.5737065528907277,
839
+ "eval_bleu": 0.9388806472418573,
840
+ "eval_cos_loss": 0.4328056685070493,
841
+ "eval_dec_loss": 0.10686975376589149,
842
+ "eval_loss": 1.4127923059565173,
843
+ "eval_mse2_loss": 0.14246540646880929,
844
+ "eval_mse3_loss": 0.03924089211867308,
845
+ "eval_mse_loss": 1.231086006296723,
846
+ "eval_rec_loss": 0.047009017791098624,
847
+ "eval_runtime": 151.3818,
848
+ "eval_samples_per_second": 198.174,
849
+ "eval_steps_per_second": 3.098,
850
+ "eval_var_loss": 0.01723895594080501,
851
+ "flow/cos_sim": 0.5671943300314295,
852
+ "flow/improvement_ratio": 0.8961777138049161,
853
+ "flow/mag_ratio_mean": 0.5744174228294063,
854
+ "flow/mag_ratio_std": 0.2622520730121812,
855
+ "step": 13312
856
+ },
857
+ {
858
+ "epoch": 0.5847393712155494,
859
+ "grad_norm": 0.7284323573112488,
860
+ "learning_rate": 0.00038279947460853446,
861
+ "loss": 1.4107180833816528,
862
+ "step": 13568
863
+ },
864
+ {
865
+ "epoch": 0.595772189540371,
866
+ "grad_norm": 1.2418670654296875,
867
+ "learning_rate": 0.00036564943604654345,
868
+ "loss": 1.410542368888855,
869
+ "step": 13824
870
+ },
871
+ {
872
+ "epoch": 0.6068050078651928,
873
+ "grad_norm": 1.182166576385498,
874
+ "learning_rate": 0.00034866814141425254,
875
+ "loss": 1.4119616746902466,
876
+ "step": 14080
877
+ },
878
+ {
879
+ "epoch": 0.6178378261900145,
880
+ "grad_norm": 0.7055562138557434,
881
+ "learning_rate": 0.0003318769191698637,
882
+ "loss": 1.4102239608764648,
883
+ "step": 14336
884
+ },
885
+ {
886
+ "epoch": 0.6178378261900145,
887
+ "eval_bleu": 0.9391033205296514,
888
+ "eval_cos_loss": 0.4321882768607597,
889
+ "eval_dec_loss": 0.10784589936202174,
890
+ "eval_loss": 1.4102330451835192,
891
+ "eval_mse2_loss": 0.14264666664003056,
892
+ "eval_mse3_loss": 0.03934626972306766,
893
+ "eval_mse_loss": 1.2282401104725755,
894
+ "eval_rec_loss": 0.047009017791098624,
895
+ "eval_var_loss": 0.01723895594080501,
896
+ "flow/cos_sim": 0.5678117201526536,
897
+ "flow/improvement_ratio": 0.8967333109394066,
898
+ "flow/mag_ratio_mean": 0.5778467524280426,
899
+ "flow/mag_ratio_std": 0.2675150448897246,
900
+ "step": 14336
901
+ },
902
+ {
903
+ "epoch": 0.6178378261900145,
904
+ "eval_bleu": 0.9391033205296514,
905
+ "eval_cos_loss": 0.4321882768607597,
906
+ "eval_dec_loss": 0.10784589936202174,
907
+ "eval_loss": 1.4102330451835192,
908
+ "eval_mse2_loss": 0.14264666664003056,
909
+ "eval_mse3_loss": 0.03934626972306766,
910
+ "eval_mse_loss": 1.2282401104725755,
911
+ "eval_rec_loss": 0.047009017791098624,
912
+ "eval_runtime": 152.545,
913
+ "eval_samples_per_second": 196.663,
914
+ "eval_steps_per_second": 3.075,
915
+ "eval_var_loss": 0.01723895594080501,
916
+ "flow/cos_sim": 0.5678117201526536,
917
+ "flow/improvement_ratio": 0.8967333109394066,
918
+ "flow/mag_ratio_mean": 0.5778467524280426,
919
+ "flow/mag_ratio_std": 0.2675150448897246,
920
+ "step": 14336
921
+ },
922
+ {
923
+ "epoch": 0.6288706445148361,
924
+ "grad_norm": 1.5477943420410156,
925
+ "learning_rate": 0.00031529685904119485,
926
+ "loss": 1.4078161716461182,
927
+ "step": 14592
928
+ },
929
+ {
930
+ "epoch": 0.6399034628396578,
931
+ "grad_norm": 1.0092837810516357,
932
+ "learning_rate": 0.0002989487855370421,
933
+ "loss": 1.4115574359893799,
934
+ "step": 14848
935
+ },
936
+ {
937
+ "epoch": 0.6509362811644795,
938
+ "grad_norm": 0.9162316918373108,
939
+ "learning_rate": 0.00028285323179165424,
940
+ "loss": 1.4091846942901611,
941
+ "step": 15104
942
+ },
943
+ {
944
+ "epoch": 0.6619690994893012,
945
+ "grad_norm": 0.5473514199256897,
946
+ "learning_rate": 0.0002670304137751759,
947
+ "loss": 1.4136096239089966,
948
+ "step": 15360
949
+ },
950
+ {
951
+ "epoch": 0.6619690994893012,
952
+ "eval_bleu": 0.9371399350692442,
953
+ "eval_cos_loss": 0.43145928612904255,
954
+ "eval_dec_loss": 0.11678189866201107,
955
+ "eval_loss": 1.412680120102124,
956
+ "eval_mse2_loss": 0.14467608757110548,
957
+ "eval_mse3_loss": 0.040516993604791066,
958
+ "eval_mse_loss": 1.227487043785388,
959
+ "eval_rec_loss": 0.047009017791098624,
960
+ "eval_var_loss": 0.01723895594080501,
961
+ "flow/cos_sim": 0.5685407153324786,
962
+ "flow/improvement_ratio": 0.8939294283832314,
963
+ "flow/mag_ratio_mean": 0.5773617385038688,
964
+ "flow/mag_ratio_std": 0.2658700548064734,
965
+ "step": 15360
966
+ },
967
+ {
968
+ "epoch": 0.6619690994893012,
969
+ "eval_bleu": 0.9371399350692442,
970
+ "eval_cos_loss": 0.43145928612904255,
971
+ "eval_dec_loss": 0.11678189866201107,
972
+ "eval_loss": 1.412680120102124,
973
+ "eval_mse2_loss": 0.14467608757110548,
974
+ "eval_mse3_loss": 0.040516993604791066,
975
+ "eval_mse_loss": 1.227487043785388,
976
+ "eval_rec_loss": 0.047009017791098624,
977
+ "eval_runtime": 153.1626,
978
+ "eval_samples_per_second": 195.87,
979
+ "eval_steps_per_second": 3.062,
980
+ "eval_var_loss": 0.01723895594080501,
981
+ "flow/cos_sim": 0.5685407153324786,
982
+ "flow/improvement_ratio": 0.8939294283832314,
983
+ "flow/mag_ratio_mean": 0.5773617385038688,
984
+ "flow/mag_ratio_std": 0.2658700548064734,
985
+ "step": 15360
986
+ },
987
+ {
988
+ "epoch": 0.6730019178141229,
989
+ "grad_norm": 1.117191195487976,
990
+ "learning_rate": 0.0002515002049024435,
991
+ "loss": 1.409903645515442,
992
+ "step": 15616
993
+ },
994
+ {
995
+ "epoch": 0.6840347361389446,
996
+ "grad_norm": 0.9312120079994202,
997
+ "learning_rate": 0.00023628211107203429,
998
+ "loss": 1.4103206396102905,
999
+ "step": 15872
1000
+ },
1001
+ {
1002
+ "epoch": 0.6950675544637662,
1003
+ "grad_norm": 0.6241945028305054,
1004
+ "learning_rate": 0.00022139524616691188,
1005
+ "loss": 1.4096852540969849,
1006
+ "step": 16128
1007
+ },
1008
+ {
1009
+ "epoch": 0.706100372788588,
1010
+ "grad_norm": 1.1047911643981934,
1011
+ "learning_rate": 0.000206858308047443,
1012
+ "loss": 1.4056518077850342,
1013
+ "step": 16384
1014
+ },
1015
+ {
1016
+ "epoch": 0.706100372788588,
1017
+ "eval_bleu": 0.939369810755522,
1018
+ "eval_cos_loss": 0.4307268185656208,
1019
+ "eval_dec_loss": 0.10748835562515868,
1020
+ "eval_loss": 1.4051913678773176,
1021
+ "eval_mse2_loss": 0.14105384695186798,
1022
+ "eval_mse3_loss": 0.03896740539225815,
1023
+ "eval_mse_loss": 1.2251701146554845,
1024
+ "eval_rec_loss": 0.047009017791098624,
1025
+ "eval_var_loss": 0.01723895594080501,
1026
+ "flow/cos_sim": 0.5692731790196921,
1027
+ "flow/improvement_ratio": 0.8951106891194894,
1028
+ "flow/mag_ratio_mean": 0.576439301342344,
1029
+ "flow/mag_ratio_std": 0.26492403600134573,
1030
+ "step": 16384
1031
+ },
1032
+ {
1033
+ "epoch": 0.706100372788588,
1034
+ "eval_bleu": 0.939369810755522,
1035
+ "eval_cos_loss": 0.4307268185656208,
1036
+ "eval_dec_loss": 0.10748835562515868,
1037
+ "eval_loss": 1.4051913678773176,
1038
+ "eval_mse2_loss": 0.14105384695186798,
1039
+ "eval_mse3_loss": 0.03896740539225815,
1040
+ "eval_mse_loss": 1.2251701146554845,
1041
+ "eval_rec_loss": 0.047009017791098624,
1042
+ "eval_runtime": 151.3979,
1043
+ "eval_samples_per_second": 198.153,
1044
+ "eval_steps_per_second": 3.098,
1045
+ "eval_var_loss": 0.01723895594080501,
1046
+ "flow/cos_sim": 0.5692731790196921,
1047
+ "flow/improvement_ratio": 0.8951106891194894,
1048
+ "flow/mag_ratio_mean": 0.576439301342344,
1049
+ "flow/mag_ratio_std": 0.26492403600134573,
1050
+ "step": 16384
1051
+ },
1052
+ {
1053
+ "epoch": 0.7171331911134096,
1054
+ "grad_norm": 0.6667467355728149,
1055
+ "learning_rate": 0.00019268955506693798,
1056
+ "loss": 1.4079476594924927,
1057
+ "step": 16640
1058
+ },
1059
+ {
1060
+ "epoch": 0.7281660094382313,
1061
+ "grad_norm": 1.067337989807129,
1062
+ "learning_rate": 0.00017890678313921,
1063
+ "loss": 1.4071507453918457,
1064
+ "step": 16896
1065
+ },
1066
+ {
1067
+ "epoch": 0.739198827763053,
1068
+ "grad_norm": 1.381058692932129,
1069
+ "learning_rate": 0.00016552730338695792,
1070
+ "loss": 1.4063572883605957,
1071
+ "step": 17152
1072
+ },
1073
+ {
1074
+ "epoch": 0.7502316460878746,
1075
+ "grad_norm": 1.0033129453659058,
1076
+ "learning_rate": 0.00015256792039904465,
1077
+ "loss": 1.404827356338501,
1078
+ "step": 17408
1079
+ },
1080
+ {
1081
+ "epoch": 0.7502316460878746,
1082
+ "eval_bleu": 0.9408599959594597,
1083
+ "eval_cos_loss": 0.4294143016023168,
1084
+ "eval_dec_loss": 0.10632320484722346,
1085
+ "eval_loss": 1.4002188098456052,
1086
+ "eval_mse2_loss": 0.14073645838224558,
1087
+ "eval_mse3_loss": 0.038728228359143614,
1088
+ "eval_mse_loss": 1.2207541257333654,
1089
+ "eval_rec_loss": 0.047009017791098624,
1090
+ "eval_var_loss": 0.01723895594080501,
1091
+ "flow/cos_sim": 0.5705856997321155,
1092
+ "flow/improvement_ratio": 0.8967551362794092,
1093
+ "flow/mag_ratio_mean": 0.579864633871294,
1094
+ "flow/mag_ratio_std": 0.2687839522544763,
1095
+ "step": 17408
1096
+ },
1097
+ {
1098
+ "epoch": 0.7502316460878746,
1099
+ "eval_bleu": 0.9408599959594597,
1100
+ "eval_cos_loss": 0.4294143016023168,
1101
+ "eval_dec_loss": 0.10632320484722346,
1102
+ "eval_loss": 1.4002188098456052,
1103
+ "eval_mse2_loss": 0.14073645838224558,
1104
+ "eval_mse3_loss": 0.038728228359143614,
1105
+ "eval_mse_loss": 1.2207541257333654,
1106
+ "eval_rec_loss": 0.047009017791098624,
1107
+ "eval_runtime": 151.5248,
1108
+ "eval_samples_per_second": 197.987,
1109
+ "eval_steps_per_second": 3.095,
1110
+ "eval_var_loss": 0.01723895594080501,
1111
+ "flow/cos_sim": 0.5705856997321155,
1112
+ "flow/improvement_ratio": 0.8967551362794092,
1113
+ "flow/mag_ratio_mean": 0.579864633871294,
1114
+ "flow/mag_ratio_std": 0.2687839522544763,
1115
+ "step": 17408
1116
+ },
1117
+ {
1118
+ "epoch": 0.7612644644126964,
1119
+ "grad_norm": 0.7275887727737427,
1120
+ "learning_rate": 0.00014004491112398103,
1121
+ "loss": 1.4066880941390991,
1122
+ "step": 17664
1123
+ },
1124
+ {
1125
+ "epoch": 0.7722972827375181,
1126
+ "grad_norm": 0.4016956686973572,
1127
+ "learning_rate": 0.00012797400442612433,
1128
+ "loss": 1.4000524282455444,
1129
+ "step": 17920
1130
+ },
1131
+ {
1132
+ "epoch": 0.7833301010623397,
1133
+ "grad_norm": 0.6152352690696716,
1134
+ "learning_rate": 0.00011637036133026895,
1135
+ "loss": 1.4042223691940308,
1136
+ "step": 18176
1137
+ },
1138
+ {
1139
+ "epoch": 0.7943629193871614,
1140
+ "grad_norm": 1.1916025876998901,
1141
+ "learning_rate": 0.00010524855597944216,
1142
+ "loss": 1.4021508693695068,
1143
+ "step": 18432
1144
+ },
1145
+ {
1146
+ "epoch": 0.7943629193871614,
1147
+ "eval_bleu": 0.9395503562023391,
1148
+ "eval_cos_loss": 0.43083440838083786,
1149
+ "eval_dec_loss": 0.10896516631621478,
1150
+ "eval_loss": 1.4056353792707041,
1151
+ "eval_mse2_loss": 0.14135168713610818,
1152
+ "eval_mse3_loss": 0.03911572252350575,
1153
+ "eval_mse_loss": 1.225167971175871,
1154
+ "eval_rec_loss": 0.047009017791098624,
1155
+ "eval_var_loss": 0.01723895594080501,
1156
+ "flow/cos_sim": 0.5691655881877647,
1157
+ "flow/improvement_ratio": 0.8966002826497499,
1158
+ "flow/mag_ratio_mean": 0.5784899838951859,
1159
+ "flow/mag_ratio_std": 0.26509309523522473,
1160
+ "step": 18432
1161
+ },
1162
+ {
1163
+ "epoch": 0.7943629193871614,
1164
+ "eval_bleu": 0.9395503562023391,
1165
+ "eval_cos_loss": 0.43083440838083786,
1166
+ "eval_dec_loss": 0.10896516631621478,
1167
+ "eval_loss": 1.4056353792707041,
1168
+ "eval_mse2_loss": 0.14135168713610818,
1169
+ "eval_mse3_loss": 0.03911572252350575,
1170
+ "eval_mse_loss": 1.225167971175871,
1171
+ "eval_rec_loss": 0.047009017791098624,
1172
+ "eval_runtime": 153.6467,
1173
+ "eval_samples_per_second": 195.253,
1174
+ "eval_steps_per_second": 3.052,
1175
+ "eval_var_loss": 0.01723895594080501,
1176
+ "flow/cos_sim": 0.5691655881877647,
1177
+ "flow/improvement_ratio": 0.8966002826497499,
1178
+ "flow/mag_ratio_mean": 0.5784899838951859,
1179
+ "flow/mag_ratio_std": 0.26509309523522473,
1180
+ "step": 18432
1181
+ },
1182
+ {
1183
+ "epoch": 0.8053957377119831,
1184
+ "grad_norm": 0.8944941759109497,
1185
+ "learning_rate": 9.462255732982089e-05,
1186
+ "loss": 1.4011187553405762,
1187
+ "step": 18688
1188
+ },
1189
+ {
1190
+ "epoch": 0.8164285560368048,
1191
+ "grad_norm": 0.6294699311256409,
1192
+ "learning_rate": 8.450571160576348e-05,
1193
+ "loss": 1.4046047925949097,
1194
+ "step": 18944
1195
+ },
1196
+ {
1197
+ "epoch": 0.8274613743616265,
1198
+ "grad_norm": 0.4316425323486328,
1199
+ "learning_rate": 7.491072553698764e-05,
1200
+ "loss": 1.4013915061950684,
1201
+ "step": 19200
1202
+ },
1203
+ {
1204
+ "epoch": 0.8384941926864482,
1205
+ "grad_norm": 0.42900189757347107,
1206
+ "learning_rate": 6.584965039895586e-05,
1207
+ "loss": 1.398647427558899,
1208
+ "step": 19456
1209
+ },
1210
+ {
1211
+ "epoch": 0.8384941926864482,
1212
+ "eval_bleu": 0.9387487761199217,
1213
+ "eval_cos_loss": 0.4305588583320951,
1214
+ "eval_dec_loss": 0.10771015434186341,
1215
+ "eval_loss": 1.4043200140568748,
1216
+ "eval_mse2_loss": 0.14159503931811115,
1217
+ "eval_mse3_loss": 0.03919749533030779,
1218
+ "eval_mse_loss": 1.2235274815610222,
1219
+ "eval_rec_loss": 0.047009017791098624,
1220
+ "eval_var_loss": 0.01723895594080501,
1221
+ "flow/cos_sim": 0.5694411402063837,
1222
+ "flow/improvement_ratio": 0.8948449469578545,
1223
+ "flow/mag_ratio_mean": 0.5805903251237198,
1224
+ "flow/mag_ratio_std": 0.267388239534679,
1225
+ "step": 19456
1226
+ },
1227
+ {
1228
+ "epoch": 0.8384941926864482,
1229
+ "eval_bleu": 0.9387487761199217,
1230
+ "eval_cos_loss": 0.4305588583320951,
1231
+ "eval_dec_loss": 0.10771015434186341,
1232
+ "eval_loss": 1.4043200140568748,
1233
+ "eval_mse2_loss": 0.14159503931811115,
1234
+ "eval_mse3_loss": 0.03919749533030779,
1235
+ "eval_mse_loss": 1.2235274815610222,
1236
+ "eval_rec_loss": 0.047009017791098624,
1237
+ "eval_runtime": 152.1206,
1238
+ "eval_samples_per_second": 197.212,
1239
+ "eval_steps_per_second": 3.083,
1240
+ "eval_var_loss": 0.01723895594080501,
1241
+ "flow/cos_sim": 0.5694411402063837,
1242
+ "flow/improvement_ratio": 0.8948449469578545,
1243
+ "flow/mag_ratio_mean": 0.5805903251237198,
1244
+ "flow/mag_ratio_std": 0.267388239534679,
1245
+ "step": 19456
1246
+ },
1247
+ {
1248
+ "epoch": 0.8495270110112698,
1249
+ "grad_norm": 0.797498881816864,
1250
+ "learning_rate": 5.73338668765051e-05,
1251
+ "loss": 1.4058088064193726,
1252
+ "step": 19712
1253
+ },
1254
+ {
1255
+ "epoch": 0.8605598293360915,
1256
+ "grad_norm": 0.463348925113678,
1257
+ "learning_rate": 4.9374070769740984e-05,
1258
+ "loss": 1.4032570123672485,
1259
+ "step": 19968
1260
+ },
1261
+ {
1262
+ "epoch": 0.8715926476609133,
1263
+ "grad_norm": 0.8888425827026367,
1264
+ "learning_rate": 4.198025956014095e-05,
1265
+ "loss": 1.403628945350647,
1266
+ "step": 20224
1267
+ },
1268
+ {
1269
+ "epoch": 0.8826254659857349,
1270
+ "grad_norm": 0.9965147376060486,
1271
+ "learning_rate": 3.516171985374755e-05,
1272
+ "loss": 1.404217004776001,
1273
+ "step": 20480
1274
+ },
1275
+ {
1276
+ "epoch": 0.8826254659857349,
1277
+ "eval_bleu": 0.9386200539251414,
1278
+ "eval_cos_loss": 0.43061384094803573,
1279
+ "eval_dec_loss": 0.1105148816930015,
1280
+ "eval_loss": 1.4051892691329597,
1281
+ "eval_mse2_loss": 0.1420527106758628,
1282
+ "eval_mse3_loss": 0.03952200293366207,
1283
+ "eval_mse_loss": 1.2236145594989314,
1284
+ "eval_rec_loss": 0.047009017791098624,
1285
+ "eval_var_loss": 0.01723895594080501,
1286
+ "flow/cos_sim": 0.5693861599415858,
1287
+ "flow/improvement_ratio": 0.8978289964356656,
1288
+ "flow/mag_ratio_mean": 0.5780887319080865,
1289
+ "flow/mag_ratio_std": 0.26583226638307955,
1290
+ "step": 20480
1291
+ },
1292
+ {
1293
+ "epoch": 0.8826254659857349,
1294
+ "eval_bleu": 0.9386200539251414,
1295
+ "eval_cos_loss": 0.43061384094803573,
1296
+ "eval_dec_loss": 0.1105148816930015,
1297
+ "eval_loss": 1.4051892691329597,
1298
+ "eval_mse2_loss": 0.1420527106758628,
1299
+ "eval_mse3_loss": 0.03952200293366207,
1300
+ "eval_mse_loss": 1.2236145594989314,
1301
+ "eval_rec_loss": 0.047009017791098624,
1302
+ "eval_runtime": 151.5082,
1303
+ "eval_samples_per_second": 198.009,
1304
+ "eval_steps_per_second": 3.096,
1305
+ "eval_var_loss": 0.01723895594080501,
1306
+ "flow/cos_sim": 0.5693861599415858,
1307
+ "flow/improvement_ratio": 0.8978289964356656,
1308
+ "flow/mag_ratio_mean": 0.5780887319080865,
1309
+ "flow/mag_ratio_std": 0.26583226638307955,
1310
+ "step": 20480
1311
+ }
1312
+ ],
1313
+ "logging_steps": 256,
1314
+ "max_steps": 23204,
1315
+ "num_input_tokens_seen": 0,
1316
+ "num_train_epochs": 1,
1317
+ "save_steps": 1024,
1318
+ "stateful_callbacks": {
1319
+ "TrainerControl": {
1320
+ "args": {
1321
+ "should_epoch_stop": false,
1322
+ "should_evaluate": false,
1323
+ "should_log": false,
1324
+ "should_save": true,
1325
+ "should_training_stop": false
1326
+ },
1327
+ "attributes": {}
1328
+ }
1329
+ },
1330
+ "total_flos": 0.0,
1331
+ "train_batch_size": 64,
1332
+ "trial_name": null,
1333
+ "trial_params": null
1334
+ }
checkpoints-d1.2/checkpoint-20480/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a16bb839f687414b8e48611327c4b9cfddeefe38c031ca70808f9a97c476b7
3
+ size 5137