crossroderick committed on
Commit 508f442 · 1 Parent(s): 70fdfe0

Fixed character mapping, training with 8 epochs

Files changed (48)
  1. .gitignore +2 -1
  2. checkpoints/{checkpoint-48000 → checkpoint-29500}/config.json +0 -0
  3. checkpoints/{checkpoint-48000 → checkpoint-29500}/generation_config.json +0 -0
  4. checkpoints/{checkpoint-48000 → checkpoint-29500}/model.safetensors +1 -1
  5. checkpoints/{checkpoint-48500 → checkpoint-29500}/optimizer.pt +1 -1
  6. checkpoints/{checkpoint-48500 → checkpoint-29500}/rng_state.pth +1 -1
  7. checkpoints/{checkpoint-48958 → checkpoint-29500}/scaler.pt +1 -1
  8. checkpoints/{checkpoint-48000 → checkpoint-29500}/scheduler.pt +1 -1
  9. checkpoints/{checkpoint-48000 → checkpoint-29500}/special_tokens_map.json +0 -0
  10. checkpoints/{checkpoint-48000 → checkpoint-29500}/spiece.model +0 -0
  11. checkpoints/{checkpoint-48000 → checkpoint-29500}/tokenizer.json +0 -0
  12. checkpoints/{checkpoint-48000 → checkpoint-29500}/tokenizer_config.json +0 -0
  13. checkpoints/checkpoint-29500/trainer_state.json +447 -0
  14. checkpoints/{checkpoint-48500 → checkpoint-29500}/training_args.bin +1 -1
  15. checkpoints/{checkpoint-48500 → checkpoint-30000}/config.json +0 -0
  16. checkpoints/{checkpoint-48500 → checkpoint-30000}/generation_config.json +0 -0
  17. checkpoints/{checkpoint-48500 → checkpoint-30000}/model.safetensors +1 -1
  18. checkpoints/{checkpoint-48000 → checkpoint-30000}/optimizer.pt +1 -1
  19. checkpoints/{checkpoint-48958 → checkpoint-30000}/rng_state.pth +1 -1
  20. checkpoints/{checkpoint-48500 → checkpoint-30000}/scaler.pt +1 -1
  21. checkpoints/{checkpoint-48500 → checkpoint-30000}/scheduler.pt +1 -1
  22. checkpoints/{checkpoint-48500 → checkpoint-30000}/special_tokens_map.json +0 -0
  23. checkpoints/{checkpoint-48500 → checkpoint-30000}/spiece.model +0 -0
  24. checkpoints/{checkpoint-48500 → checkpoint-30000}/tokenizer.json +0 -0
  25. checkpoints/{checkpoint-48500 → checkpoint-30000}/tokenizer_config.json +0 -0
  26. checkpoints/checkpoint-30000/trainer_state.json +454 -0
  27. checkpoints/{checkpoint-48000 → checkpoint-30000}/training_args.bin +1 -1
  28. checkpoints/{checkpoint-48958 → checkpoint-30128}/config.json +0 -0
  29. checkpoints/{checkpoint-48958 → checkpoint-30128}/generation_config.json +0 -0
  30. checkpoints/{checkpoint-48958 → checkpoint-30128}/model.safetensors +1 -1
  31. checkpoints/{checkpoint-48958 → checkpoint-30128}/optimizer.pt +1 -1
  32. checkpoints/{checkpoint-48000 → checkpoint-30128}/rng_state.pth +1 -1
  33. checkpoints/{checkpoint-48000 → checkpoint-30128}/scaler.pt +1 -1
  34. checkpoints/{checkpoint-48958 → checkpoint-30128}/scheduler.pt +1 -1
  35. checkpoints/{checkpoint-48958 → checkpoint-30128}/special_tokens_map.json +0 -0
  36. checkpoints/{checkpoint-48958 → checkpoint-30128}/spiece.model +0 -0
  37. checkpoints/{checkpoint-48958 → checkpoint-30128}/tokenizer.json +0 -0
  38. checkpoints/{checkpoint-48958 → checkpoint-30128}/tokenizer_config.json +0 -0
  39. checkpoints/checkpoint-30128/trainer_state.json +454 -0
  40. checkpoints/{checkpoint-48958 → checkpoint-30128}/training_args.bin +1 -1
  41. checkpoints/checkpoint-48000/trainer_state.json +0 -706
  42. checkpoints/checkpoint-48500/trainer_state.json +0 -713
  43. checkpoints/checkpoint-48958/trainer_state.json +0 -713
  44. model.safetensors +1 -1
  45. src/data/clean_corpus.jsonl +2 -2
  46. src/data/generate_cyr_lat_pairs.py +52 -21
  47. src/data/kazakh_latin_corpus.jsonl +2 -2
  48. src/train_t5.py +1 -1
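The renamed checkpoint directories above contain standard Hugging Face T5-style artifacts (config.json, model.safetensors, spiece.model, tokenizer.json), so the final checkpoint can be loaded directly with transformers. A minimal sketch, with an illustrative input string; the checkpoint path comes from this commit, but any task prefix applied by src/train_t5.py would also have to be reproduced here:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the last checkpoint of this run as a plain seq2seq checkpoint.
# The sample input below is illustrative, not taken from the repo code.
ckpt = "checkpoints/checkpoint-30128"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt)

inputs = tokenizer("Қазақстан", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))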
.gitignore CHANGED
@@ -1,3 +1,4 @@
 /src/data/extracted
 /src/data/cached_lm_GPT2TokenizerFast_16_kazakh_latin_corpus.txt.lock
-/logs/**
+/logs/**
+/src/test_t5.py
checkpoints/{checkpoint-48000 → checkpoint-29500}/config.json RENAMED
File without changes
checkpoints/{checkpoint-48000 → checkpoint-29500}/generation_config.json RENAMED
File without changes
checkpoints/{checkpoint-48000 → checkpoint-29500}/model.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:448d7df6b5c8a8d5c909e3c4d89c4aa963ded7b56216c411ac30831d871a0c0f
+oid sha256:fdb28e980018170c065904f2e04da35523851bcb3c34f95dbf54242ff3e67ca2
 size 242041896
checkpoints/{checkpoint-48500 → checkpoint-29500}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5690c8f1770608957d58ee1669185db353a51865711ab0bf64130487c7819403
+oid sha256:a3e5e1319e885d7755f8d9cf8489fe3c33f3cb1b97a64f33e1bc04326c8b213d
 size 484163514
checkpoints/{checkpoint-48500 → checkpoint-29500}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ddfcaf6362c874707d13984943bc1adea41c31767683dbe5609d23dc5ebfbeca
+oid sha256:dcddfc80cf7328649cba003a5107f899a3a86e174508f8f643f832b2873f3582
 size 14244
checkpoints/{checkpoint-48958 → checkpoint-29500}/scaler.pt RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3b7a5ccbbd174b024f0fc150e65ae009070562b75b60db0af53945cd22f7011f
+oid sha256:289b2af7454fae01f03f6828f40c8444533c4ef735a5c5218220664ea8958116
 size 988
checkpoints/{checkpoint-48000 → checkpoint-29500}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f0517e4908a64405450c76a0e1a824f08d0bbd55697d60f15520762091350a0a
+oid sha256:ed5c0230314afe0b59d26d9f16bd32ebbe87088c9db65e48060f0f1bb2097f2e
 size 1064
checkpoints/{checkpoint-48000 → checkpoint-29500}/special_tokens_map.json RENAMED
File without changes
checkpoints/{checkpoint-48000 → checkpoint-29500}/spiece.model RENAMED
File without changes
checkpoints/{checkpoint-48000 → checkpoint-29500}/tokenizer.json RENAMED
File without changes
checkpoints/{checkpoint-48000 → checkpoint-29500}/tokenizer_config.json RENAMED
File without changes
checkpoints/checkpoint-29500/trainer_state.json ADDED
@@ -0,0 +1,447 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 7.833244822092405,
6
+ "eval_steps": 500,
7
+ "global_step": 29500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.1327668613913967,
14
+ "grad_norm": 0.633160412311554,
15
+ "learning_rate": 4.917352628783856e-05,
16
+ "loss": 2.9498,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.2655337227827934,
21
+ "grad_norm": 0.4872244596481323,
22
+ "learning_rate": 4.834373340414233e-05,
23
+ "loss": 1.7486,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.3983005841741901,
28
+ "grad_norm": 0.4818345904350281,
29
+ "learning_rate": 4.75139405204461e-05,
30
+ "loss": 1.4326,
31
+ "step": 1500
32
+ },
33
+ {
34
+ "epoch": 0.5310674455655868,
35
+ "grad_norm": 0.5155183672904968,
36
+ "learning_rate": 4.668414763674987e-05,
37
+ "loss": 1.2762,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 0.6638343069569835,
42
+ "grad_norm": 0.4676671326160431,
43
+ "learning_rate": 4.585435475305364e-05,
44
+ "loss": 1.1738,
45
+ "step": 2500
46
+ },
47
+ {
48
+ "epoch": 0.7966011683483802,
49
+ "grad_norm": 0.40301135182380676,
50
+ "learning_rate": 4.502456186935741e-05,
51
+ "loss": 1.1018,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.929368029739777,
56
+ "grad_norm": 0.3687935471534729,
57
+ "learning_rate": 4.419476898566118e-05,
58
+ "loss": 1.0399,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 1.0621348911311737,
63
+ "grad_norm": 0.3971472978591919,
64
+ "learning_rate": 4.336497610196495e-05,
65
+ "loss": 1.002,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 1.1949017525225702,
70
+ "grad_norm": 0.3881046175956726,
71
+ "learning_rate": 4.253518321826872e-05,
72
+ "loss": 0.9598,
73
+ "step": 4500
74
+ },
75
+ {
76
+ "epoch": 1.327668613913967,
77
+ "grad_norm": 0.41512489318847656,
78
+ "learning_rate": 4.170539033457249e-05,
79
+ "loss": 0.9347,
80
+ "step": 5000
81
+ },
82
+ {
83
+ "epoch": 1.4604354753053639,
84
+ "grad_norm": 0.4095878303050995,
85
+ "learning_rate": 4.0875597450876265e-05,
86
+ "loss": 0.9037,
87
+ "step": 5500
88
+ },
89
+ {
90
+ "epoch": 1.5932023366967605,
91
+ "grad_norm": 0.42337608337402344,
92
+ "learning_rate": 4.004580456718004e-05,
93
+ "loss": 0.8754,
94
+ "step": 6000
95
+ },
96
+ {
97
+ "epoch": 1.725969198088157,
98
+ "grad_norm": 0.3757030963897705,
99
+ "learning_rate": 3.92176712692512e-05,
100
+ "loss": 0.8545,
101
+ "step": 6500
102
+ },
103
+ {
104
+ "epoch": 1.858736059479554,
105
+ "grad_norm": 0.3615691363811493,
106
+ "learning_rate": 3.838787838555496e-05,
107
+ "loss": 0.8403,
108
+ "step": 7000
109
+ },
110
+ {
111
+ "epoch": 1.9915029208709507,
112
+ "grad_norm": 0.3412405848503113,
113
+ "learning_rate": 3.7558085501858735e-05,
114
+ "loss": 0.8319,
115
+ "step": 7500
116
+ },
117
+ {
118
+ "epoch": 2.1242697822623473,
119
+ "grad_norm": 0.3392370641231537,
120
+ "learning_rate": 3.67299522039299e-05,
121
+ "loss": 0.8125,
122
+ "step": 8000
123
+ },
124
+ {
125
+ "epoch": 2.257036643653744,
126
+ "grad_norm": 0.3229221701622009,
127
+ "learning_rate": 3.5900159320233674e-05,
128
+ "loss": 0.7917,
129
+ "step": 8500
130
+ },
131
+ {
132
+ "epoch": 2.3898035050451405,
133
+ "grad_norm": 0.3485640585422516,
134
+ "learning_rate": 3.5070366436537446e-05,
135
+ "loss": 0.7852,
136
+ "step": 9000
137
+ },
138
+ {
139
+ "epoch": 2.5225703664365375,
140
+ "grad_norm": 0.32653355598449707,
141
+ "learning_rate": 3.424057355284121e-05,
142
+ "loss": 0.7725,
143
+ "step": 9500
144
+ },
145
+ {
146
+ "epoch": 2.655337227827934,
147
+ "grad_norm": 0.35505905747413635,
148
+ "learning_rate": 3.3410780669144984e-05,
149
+ "loss": 0.7636,
150
+ "step": 10000
151
+ },
152
+ {
153
+ "epoch": 2.7881040892193307,
154
+ "grad_norm": 0.36597949266433716,
155
+ "learning_rate": 3.258098778544875e-05,
156
+ "loss": 0.7495,
157
+ "step": 10500
158
+ },
159
+ {
160
+ "epoch": 2.9208709506107278,
161
+ "grad_norm": 0.3176648020744324,
162
+ "learning_rate": 3.175119490175252e-05,
163
+ "loss": 0.7416,
164
+ "step": 11000
165
+ },
166
+ {
167
+ "epoch": 3.0536378120021244,
168
+ "grad_norm": 0.5107095241546631,
169
+ "learning_rate": 3.0921402018056293e-05,
170
+ "loss": 0.7288,
171
+ "step": 11500
172
+ },
173
+ {
174
+ "epoch": 3.186404673393521,
175
+ "grad_norm": 0.3380516767501831,
176
+ "learning_rate": 3.0091609134360066e-05,
177
+ "loss": 0.7249,
178
+ "step": 12000
179
+ },
180
+ {
181
+ "epoch": 3.3191715347849176,
182
+ "grad_norm": 0.33378171920776367,
183
+ "learning_rate": 2.9261816250663838e-05,
184
+ "loss": 0.7212,
185
+ "step": 12500
186
+ },
187
+ {
188
+ "epoch": 3.451938396176314,
189
+ "grad_norm": 0.3314702808856964,
190
+ "learning_rate": 2.843202336696761e-05,
191
+ "loss": 0.715,
192
+ "step": 13000
193
+ },
194
+ {
195
+ "epoch": 3.584705257567711,
196
+ "grad_norm": 0.3204588294029236,
197
+ "learning_rate": 2.7602230483271375e-05,
198
+ "loss": 0.7026,
199
+ "step": 13500
200
+ },
201
+ {
202
+ "epoch": 3.717472118959108,
203
+ "grad_norm": 0.32662609219551086,
204
+ "learning_rate": 2.677409718534254e-05,
205
+ "loss": 0.6965,
206
+ "step": 14000
207
+ },
208
+ {
209
+ "epoch": 3.8502389803505044,
210
+ "grad_norm": 0.35398003458976746,
211
+ "learning_rate": 2.594430430164631e-05,
212
+ "loss": 0.6934,
213
+ "step": 14500
214
+ },
215
+ {
216
+ "epoch": 3.9830058417419014,
217
+ "grad_norm": 0.3326145112514496,
218
+ "learning_rate": 2.5116171003717474e-05,
219
+ "loss": 0.6882,
220
+ "step": 15000
221
+ },
222
+ {
223
+ "epoch": 4.115772703133298,
224
+ "grad_norm": 0.4153067469596863,
225
+ "learning_rate": 2.4286378120021243e-05,
226
+ "loss": 0.688,
227
+ "step": 15500
228
+ },
229
+ {
230
+ "epoch": 4.248539564524695,
231
+ "grad_norm": 0.295330673456192,
232
+ "learning_rate": 2.3456585236325015e-05,
233
+ "loss": 0.6789,
234
+ "step": 16000
235
+ },
236
+ {
237
+ "epoch": 4.381306425916091,
238
+ "grad_norm": 0.2986246943473816,
239
+ "learning_rate": 2.2626792352628784e-05,
240
+ "loss": 0.678,
241
+ "step": 16500
242
+ },
243
+ {
244
+ "epoch": 4.514073287307488,
245
+ "grad_norm": 0.3353865444660187,
246
+ "learning_rate": 2.1796999468932556e-05,
247
+ "loss": 0.671,
248
+ "step": 17000
249
+ },
250
+ {
251
+ "epoch": 4.646840148698884,
252
+ "grad_norm": 0.3019355833530426,
253
+ "learning_rate": 2.0967206585236325e-05,
254
+ "loss": 0.6637,
255
+ "step": 17500
256
+ },
257
+ {
258
+ "epoch": 4.779607010090281,
259
+ "grad_norm": 0.281649649143219,
260
+ "learning_rate": 2.0137413701540097e-05,
261
+ "loss": 0.6557,
262
+ "step": 18000
263
+ },
264
+ {
265
+ "epoch": 4.9123738714816785,
266
+ "grad_norm": 0.3320733904838562,
267
+ "learning_rate": 1.9307620817843866e-05,
268
+ "loss": 0.6643,
269
+ "step": 18500
270
+ },
271
+ {
272
+ "epoch": 5.045140732873075,
273
+ "grad_norm": 0.33195579051971436,
274
+ "learning_rate": 1.8479487519915033e-05,
275
+ "loss": 0.6513,
276
+ "step": 19000
277
+ },
278
+ {
279
+ "epoch": 5.177907594264472,
280
+ "grad_norm": 0.3125695288181305,
281
+ "learning_rate": 1.7649694636218798e-05,
282
+ "loss": 0.6513,
283
+ "step": 19500
284
+ },
285
+ {
286
+ "epoch": 5.310674455655868,
287
+ "grad_norm": 0.30982694029808044,
288
+ "learning_rate": 1.682156133828996e-05,
289
+ "loss": 0.6511,
290
+ "step": 20000
291
+ },
292
+ {
293
+ "epoch": 5.443441317047265,
294
+ "grad_norm": 0.307338684797287,
295
+ "learning_rate": 1.5991768454593734e-05,
296
+ "loss": 0.6517,
297
+ "step": 20500
298
+ },
299
+ {
300
+ "epoch": 5.5762081784386615,
301
+ "grad_norm": 0.29489612579345703,
302
+ "learning_rate": 1.5161975570897504e-05,
303
+ "loss": 0.642,
304
+ "step": 21000
305
+ },
306
+ {
307
+ "epoch": 5.708975039830058,
308
+ "grad_norm": 0.27674245834350586,
309
+ "learning_rate": 1.4332182687201276e-05,
310
+ "loss": 0.6452,
311
+ "step": 21500
312
+ },
313
+ {
314
+ "epoch": 5.8417419012214555,
315
+ "grad_norm": 0.3172672688961029,
316
+ "learning_rate": 1.3502389803505045e-05,
317
+ "loss": 0.6436,
318
+ "step": 22000
319
+ },
320
+ {
321
+ "epoch": 5.974508762612852,
322
+ "grad_norm": 0.3144547641277313,
323
+ "learning_rate": 1.2672596919808816e-05,
324
+ "loss": 0.6385,
325
+ "step": 22500
326
+ },
327
+ {
328
+ "epoch": 6.107275624004249,
329
+ "grad_norm": 0.3105868399143219,
330
+ "learning_rate": 1.1844463621879979e-05,
331
+ "loss": 0.6329,
332
+ "step": 23000
333
+ },
334
+ {
335
+ "epoch": 6.240042485395645,
336
+ "grad_norm": 0.3802029490470886,
337
+ "learning_rate": 1.101467073818375e-05,
338
+ "loss": 0.6352,
339
+ "step": 23500
340
+ },
341
+ {
342
+ "epoch": 6.372809346787042,
343
+ "grad_norm": 0.4196580946445465,
344
+ "learning_rate": 1.018487785448752e-05,
345
+ "loss": 0.634,
346
+ "step": 24000
347
+ },
348
+ {
349
+ "epoch": 6.5055762081784385,
350
+ "grad_norm": 0.28329288959503174,
351
+ "learning_rate": 9.35508497079129e-06,
352
+ "loss": 0.6339,
353
+ "step": 24500
354
+ },
355
+ {
356
+ "epoch": 6.638343069569835,
357
+ "grad_norm": 0.28894877433776855,
358
+ "learning_rate": 8.525292087095061e-06,
359
+ "loss": 0.6312,
360
+ "step": 25000
361
+ },
362
+ {
363
+ "epoch": 6.771109930961232,
364
+ "grad_norm": 0.3208027482032776,
365
+ "learning_rate": 7.697158789166224e-06,
366
+ "loss": 0.6361,
367
+ "step": 25500
368
+ },
369
+ {
370
+ "epoch": 6.903876792352628,
371
+ "grad_norm": 0.3069681227207184,
372
+ "learning_rate": 6.867365905469996e-06,
373
+ "loss": 0.6305,
374
+ "step": 26000
375
+ },
376
+ {
377
+ "epoch": 7.036643653744026,
378
+ "grad_norm": 0.35687127709388733,
379
+ "learning_rate": 6.037573021773765e-06,
380
+ "loss": 0.6283,
381
+ "step": 26500
382
+ },
383
+ {
384
+ "epoch": 7.169410515135422,
385
+ "grad_norm": 0.3692304193973541,
386
+ "learning_rate": 5.207780138077536e-06,
387
+ "loss": 0.6272,
388
+ "step": 27000
389
+ },
390
+ {
391
+ "epoch": 7.302177376526819,
392
+ "grad_norm": 0.2998880445957184,
393
+ "learning_rate": 4.377987254381306e-06,
394
+ "loss": 0.6257,
395
+ "step": 27500
396
+ },
397
+ {
398
+ "epoch": 7.434944237918216,
399
+ "grad_norm": 0.35746654868125916,
400
+ "learning_rate": 3.5481943706850767e-06,
401
+ "loss": 0.6233,
402
+ "step": 28000
403
+ },
404
+ {
405
+ "epoch": 7.567711099309612,
406
+ "grad_norm": 0.27425265312194824,
407
+ "learning_rate": 2.7184014869888476e-06,
408
+ "loss": 0.6299,
409
+ "step": 28500
410
+ },
411
+ {
412
+ "epoch": 7.700477960701009,
413
+ "grad_norm": 0.5259220004081726,
414
+ "learning_rate": 1.8886086032926183e-06,
415
+ "loss": 0.6211,
416
+ "step": 29000
417
+ },
418
+ {
419
+ "epoch": 7.833244822092405,
420
+ "grad_norm": 0.38208672404289246,
421
+ "learning_rate": 1.0588157195963888e-06,
422
+ "loss": 0.6283,
423
+ "step": 29500
424
+ }
425
+ ],
426
+ "logging_steps": 500,
427
+ "max_steps": 30128,
428
+ "num_input_tokens_seen": 0,
429
+ "num_train_epochs": 8,
430
+ "save_steps": 500,
431
+ "stateful_callbacks": {
432
+ "TrainerControl": {
433
+ "args": {
434
+ "should_epoch_stop": false,
435
+ "should_evaluate": false,
436
+ "should_log": false,
437
+ "should_save": true,
438
+ "should_training_stop": false
439
+ },
440
+ "attributes": {}
441
+ }
442
+ },
443
+ "total_flos": 6.387422485020672e+16,
444
+ "train_batch_size": 32,
445
+ "trial_name": null,
446
+ "trial_params": null
447
+ }
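The added trainer_state.json files follow the standard Trainer layout shown above, so the logged loss curve can be pulled straight out of log_history. A minimal sketch in Python (the path is illustrative):

import json

# Read one of the trainer states added in this commit and extract the
# (step, loss) pairs that the Trainer logged every 500 steps.
with open("checkpoints/checkpoint-29500/trainer_state.json") as f:
    state = json.load(f)

logged = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
print(f"global_step={state['global_step']}, epoch={state['epoch']:.3f}")
print(logged[:3], "...", logged[-1])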
checkpoints/{checkpoint-48500 → checkpoint-29500}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e94179d735e9ead00b90fad45af99e009779a12dba4e32a7dde92da29b59e62
+oid sha256:7c12c6ed93615e8828ca5922138bd2446a795024a161ed3d049b59ff409bebcb
 size 5240
checkpoints/{checkpoint-48500 → checkpoint-30000}/config.json RENAMED
File without changes
checkpoints/{checkpoint-48500 → checkpoint-30000}/generation_config.json RENAMED
File without changes
checkpoints/{checkpoint-48500 → checkpoint-30000}/model.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ad7908f1a4a883ec2a05b25bc3adea9d943ff8aca1b6ea1dbfbe7566f2567d29
+oid sha256:12c1eeee5dfe80f331f9232aa68046603d58e8f7860cd395d5d8dd3c8eef3b85
 size 242041896
checkpoints/{checkpoint-48000 → checkpoint-30000}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e9fe7c27b785e4ff27d48401dd7f31aae5c64bc7ca605f94862d0563134ecebf
+oid sha256:d752fb89f9d928713e11d43ba26b120b5d58a345948630329de4c46ceff82d47
 size 484163514
checkpoints/{checkpoint-48958 → checkpoint-30000}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:95c3338e3a3d60e44d86ce6eac15796914159dbfffe2407b4e60c3ab111b52e4
+oid sha256:4cad067dbb89e76f44a290c6d80beda051459c3cb91444edcd0c785deaf550ce
 size 14244
checkpoints/{checkpoint-48500 → checkpoint-30000}/scaler.pt RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:59e87ed4c6364d70f1e30a51af92bf3a98a2981d4fdf0ef1f2dd5fd5300af10e
+oid sha256:26864b6290f6c5fdd3e1fd49074ccfdcb21505716f6c2aa7da02ad79f5e3bc11
 size 988
checkpoints/{checkpoint-48500 → checkpoint-30000}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4fdc49e90537dfdf6d385a7dd2e81cb630ecb40b0032ba11685e60e793c47af2
+oid sha256:cb64ba69ad4cf503098169e20a022b5a978742edf5660dc5350af9646cee3894
 size 1064
checkpoints/{checkpoint-48500 → checkpoint-30000}/special_tokens_map.json RENAMED
File without changes
checkpoints/{checkpoint-48500 → checkpoint-30000}/spiece.model RENAMED
File without changes
checkpoints/{checkpoint-48500 → checkpoint-30000}/tokenizer.json RENAMED
File without changes
checkpoints/{checkpoint-48500 → checkpoint-30000}/tokenizer_config.json RENAMED
File without changes
checkpoints/checkpoint-30000/trainer_state.json ADDED
@@ -0,0 +1,454 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 7.966011683483803,
6
+ "eval_steps": 500,
7
+ "global_step": 30000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.1327668613913967,
14
+ "grad_norm": 0.633160412311554,
15
+ "learning_rate": 4.917352628783856e-05,
16
+ "loss": 2.9498,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.2655337227827934,
21
+ "grad_norm": 0.4872244596481323,
22
+ "learning_rate": 4.834373340414233e-05,
23
+ "loss": 1.7486,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.3983005841741901,
28
+ "grad_norm": 0.4818345904350281,
29
+ "learning_rate": 4.75139405204461e-05,
30
+ "loss": 1.4326,
31
+ "step": 1500
32
+ },
33
+ {
34
+ "epoch": 0.5310674455655868,
35
+ "grad_norm": 0.5155183672904968,
36
+ "learning_rate": 4.668414763674987e-05,
37
+ "loss": 1.2762,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 0.6638343069569835,
42
+ "grad_norm": 0.4676671326160431,
43
+ "learning_rate": 4.585435475305364e-05,
44
+ "loss": 1.1738,
45
+ "step": 2500
46
+ },
47
+ {
48
+ "epoch": 0.7966011683483802,
49
+ "grad_norm": 0.40301135182380676,
50
+ "learning_rate": 4.502456186935741e-05,
51
+ "loss": 1.1018,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.929368029739777,
56
+ "grad_norm": 0.3687935471534729,
57
+ "learning_rate": 4.419476898566118e-05,
58
+ "loss": 1.0399,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 1.0621348911311737,
63
+ "grad_norm": 0.3971472978591919,
64
+ "learning_rate": 4.336497610196495e-05,
65
+ "loss": 1.002,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 1.1949017525225702,
70
+ "grad_norm": 0.3881046175956726,
71
+ "learning_rate": 4.253518321826872e-05,
72
+ "loss": 0.9598,
73
+ "step": 4500
74
+ },
75
+ {
76
+ "epoch": 1.327668613913967,
77
+ "grad_norm": 0.41512489318847656,
78
+ "learning_rate": 4.170539033457249e-05,
79
+ "loss": 0.9347,
80
+ "step": 5000
81
+ },
82
+ {
83
+ "epoch": 1.4604354753053639,
84
+ "grad_norm": 0.4095878303050995,
85
+ "learning_rate": 4.0875597450876265e-05,
86
+ "loss": 0.9037,
87
+ "step": 5500
88
+ },
89
+ {
90
+ "epoch": 1.5932023366967605,
91
+ "grad_norm": 0.42337608337402344,
92
+ "learning_rate": 4.004580456718004e-05,
93
+ "loss": 0.8754,
94
+ "step": 6000
95
+ },
96
+ {
97
+ "epoch": 1.725969198088157,
98
+ "grad_norm": 0.3757030963897705,
99
+ "learning_rate": 3.92176712692512e-05,
100
+ "loss": 0.8545,
101
+ "step": 6500
102
+ },
103
+ {
104
+ "epoch": 1.858736059479554,
105
+ "grad_norm": 0.3615691363811493,
106
+ "learning_rate": 3.838787838555496e-05,
107
+ "loss": 0.8403,
108
+ "step": 7000
109
+ },
110
+ {
111
+ "epoch": 1.9915029208709507,
112
+ "grad_norm": 0.3412405848503113,
113
+ "learning_rate": 3.7558085501858735e-05,
114
+ "loss": 0.8319,
115
+ "step": 7500
116
+ },
117
+ {
118
+ "epoch": 2.1242697822623473,
119
+ "grad_norm": 0.3392370641231537,
120
+ "learning_rate": 3.67299522039299e-05,
121
+ "loss": 0.8125,
122
+ "step": 8000
123
+ },
124
+ {
125
+ "epoch": 2.257036643653744,
126
+ "grad_norm": 0.3229221701622009,
127
+ "learning_rate": 3.5900159320233674e-05,
128
+ "loss": 0.7917,
129
+ "step": 8500
130
+ },
131
+ {
132
+ "epoch": 2.3898035050451405,
133
+ "grad_norm": 0.3485640585422516,
134
+ "learning_rate": 3.5070366436537446e-05,
135
+ "loss": 0.7852,
136
+ "step": 9000
137
+ },
138
+ {
139
+ "epoch": 2.5225703664365375,
140
+ "grad_norm": 0.32653355598449707,
141
+ "learning_rate": 3.424057355284121e-05,
142
+ "loss": 0.7725,
143
+ "step": 9500
144
+ },
145
+ {
146
+ "epoch": 2.655337227827934,
147
+ "grad_norm": 0.35505905747413635,
148
+ "learning_rate": 3.3410780669144984e-05,
149
+ "loss": 0.7636,
150
+ "step": 10000
151
+ },
152
+ {
153
+ "epoch": 2.7881040892193307,
154
+ "grad_norm": 0.36597949266433716,
155
+ "learning_rate": 3.258098778544875e-05,
156
+ "loss": 0.7495,
157
+ "step": 10500
158
+ },
159
+ {
160
+ "epoch": 2.9208709506107278,
161
+ "grad_norm": 0.3176648020744324,
162
+ "learning_rate": 3.175119490175252e-05,
163
+ "loss": 0.7416,
164
+ "step": 11000
165
+ },
166
+ {
167
+ "epoch": 3.0536378120021244,
168
+ "grad_norm": 0.5107095241546631,
169
+ "learning_rate": 3.0921402018056293e-05,
170
+ "loss": 0.7288,
171
+ "step": 11500
172
+ },
173
+ {
174
+ "epoch": 3.186404673393521,
175
+ "grad_norm": 0.3380516767501831,
176
+ "learning_rate": 3.0091609134360066e-05,
177
+ "loss": 0.7249,
178
+ "step": 12000
179
+ },
180
+ {
181
+ "epoch": 3.3191715347849176,
182
+ "grad_norm": 0.33378171920776367,
183
+ "learning_rate": 2.9261816250663838e-05,
184
+ "loss": 0.7212,
185
+ "step": 12500
186
+ },
187
+ {
188
+ "epoch": 3.451938396176314,
189
+ "grad_norm": 0.3314702808856964,
190
+ "learning_rate": 2.843202336696761e-05,
191
+ "loss": 0.715,
192
+ "step": 13000
193
+ },
194
+ {
195
+ "epoch": 3.584705257567711,
196
+ "grad_norm": 0.3204588294029236,
197
+ "learning_rate": 2.7602230483271375e-05,
198
+ "loss": 0.7026,
199
+ "step": 13500
200
+ },
201
+ {
202
+ "epoch": 3.717472118959108,
203
+ "grad_norm": 0.32662609219551086,
204
+ "learning_rate": 2.677409718534254e-05,
205
+ "loss": 0.6965,
206
+ "step": 14000
207
+ },
208
+ {
209
+ "epoch": 3.8502389803505044,
210
+ "grad_norm": 0.35398003458976746,
211
+ "learning_rate": 2.594430430164631e-05,
212
+ "loss": 0.6934,
213
+ "step": 14500
214
+ },
215
+ {
216
+ "epoch": 3.9830058417419014,
217
+ "grad_norm": 0.3326145112514496,
218
+ "learning_rate": 2.5116171003717474e-05,
219
+ "loss": 0.6882,
220
+ "step": 15000
221
+ },
222
+ {
223
+ "epoch": 4.115772703133298,
224
+ "grad_norm": 0.4153067469596863,
225
+ "learning_rate": 2.4286378120021243e-05,
226
+ "loss": 0.688,
227
+ "step": 15500
228
+ },
229
+ {
230
+ "epoch": 4.248539564524695,
231
+ "grad_norm": 0.295330673456192,
232
+ "learning_rate": 2.3456585236325015e-05,
233
+ "loss": 0.6789,
234
+ "step": 16000
235
+ },
236
+ {
237
+ "epoch": 4.381306425916091,
238
+ "grad_norm": 0.2986246943473816,
239
+ "learning_rate": 2.2626792352628784e-05,
240
+ "loss": 0.678,
241
+ "step": 16500
242
+ },
243
+ {
244
+ "epoch": 4.514073287307488,
245
+ "grad_norm": 0.3353865444660187,
246
+ "learning_rate": 2.1796999468932556e-05,
247
+ "loss": 0.671,
248
+ "step": 17000
249
+ },
250
+ {
251
+ "epoch": 4.646840148698884,
252
+ "grad_norm": 0.3019355833530426,
253
+ "learning_rate": 2.0967206585236325e-05,
254
+ "loss": 0.6637,
255
+ "step": 17500
256
+ },
257
+ {
258
+ "epoch": 4.779607010090281,
259
+ "grad_norm": 0.281649649143219,
260
+ "learning_rate": 2.0137413701540097e-05,
261
+ "loss": 0.6557,
262
+ "step": 18000
263
+ },
264
+ {
265
+ "epoch": 4.9123738714816785,
266
+ "grad_norm": 0.3320733904838562,
267
+ "learning_rate": 1.9307620817843866e-05,
268
+ "loss": 0.6643,
269
+ "step": 18500
270
+ },
271
+ {
272
+ "epoch": 5.045140732873075,
273
+ "grad_norm": 0.33195579051971436,
274
+ "learning_rate": 1.8479487519915033e-05,
275
+ "loss": 0.6513,
276
+ "step": 19000
277
+ },
278
+ {
279
+ "epoch": 5.177907594264472,
280
+ "grad_norm": 0.3125695288181305,
281
+ "learning_rate": 1.7649694636218798e-05,
282
+ "loss": 0.6513,
283
+ "step": 19500
284
+ },
285
+ {
286
+ "epoch": 5.310674455655868,
287
+ "grad_norm": 0.30982694029808044,
288
+ "learning_rate": 1.682156133828996e-05,
289
+ "loss": 0.6511,
290
+ "step": 20000
291
+ },
292
+ {
293
+ "epoch": 5.443441317047265,
294
+ "grad_norm": 0.307338684797287,
295
+ "learning_rate": 1.5991768454593734e-05,
296
+ "loss": 0.6517,
297
+ "step": 20500
298
+ },
299
+ {
300
+ "epoch": 5.5762081784386615,
301
+ "grad_norm": 0.29489612579345703,
302
+ "learning_rate": 1.5161975570897504e-05,
303
+ "loss": 0.642,
304
+ "step": 21000
305
+ },
306
+ {
307
+ "epoch": 5.708975039830058,
308
+ "grad_norm": 0.27674245834350586,
309
+ "learning_rate": 1.4332182687201276e-05,
310
+ "loss": 0.6452,
311
+ "step": 21500
312
+ },
313
+ {
314
+ "epoch": 5.8417419012214555,
315
+ "grad_norm": 0.3172672688961029,
316
+ "learning_rate": 1.3502389803505045e-05,
317
+ "loss": 0.6436,
318
+ "step": 22000
319
+ },
320
+ {
321
+ "epoch": 5.974508762612852,
322
+ "grad_norm": 0.3144547641277313,
323
+ "learning_rate": 1.2672596919808816e-05,
324
+ "loss": 0.6385,
325
+ "step": 22500
326
+ },
327
+ {
328
+ "epoch": 6.107275624004249,
329
+ "grad_norm": 0.3105868399143219,
330
+ "learning_rate": 1.1844463621879979e-05,
331
+ "loss": 0.6329,
332
+ "step": 23000
333
+ },
334
+ {
335
+ "epoch": 6.240042485395645,
336
+ "grad_norm": 0.3802029490470886,
337
+ "learning_rate": 1.101467073818375e-05,
338
+ "loss": 0.6352,
339
+ "step": 23500
340
+ },
341
+ {
342
+ "epoch": 6.372809346787042,
343
+ "grad_norm": 0.4196580946445465,
344
+ "learning_rate": 1.018487785448752e-05,
345
+ "loss": 0.634,
346
+ "step": 24000
347
+ },
348
+ {
349
+ "epoch": 6.5055762081784385,
350
+ "grad_norm": 0.28329288959503174,
351
+ "learning_rate": 9.35508497079129e-06,
352
+ "loss": 0.6339,
353
+ "step": 24500
354
+ },
355
+ {
356
+ "epoch": 6.638343069569835,
357
+ "grad_norm": 0.28894877433776855,
358
+ "learning_rate": 8.525292087095061e-06,
359
+ "loss": 0.6312,
360
+ "step": 25000
361
+ },
362
+ {
363
+ "epoch": 6.771109930961232,
364
+ "grad_norm": 0.3208027482032776,
365
+ "learning_rate": 7.697158789166224e-06,
366
+ "loss": 0.6361,
367
+ "step": 25500
368
+ },
369
+ {
370
+ "epoch": 6.903876792352628,
371
+ "grad_norm": 0.3069681227207184,
372
+ "learning_rate": 6.867365905469996e-06,
373
+ "loss": 0.6305,
374
+ "step": 26000
375
+ },
376
+ {
377
+ "epoch": 7.036643653744026,
378
+ "grad_norm": 0.35687127709388733,
379
+ "learning_rate": 6.037573021773765e-06,
380
+ "loss": 0.6283,
381
+ "step": 26500
382
+ },
383
+ {
384
+ "epoch": 7.169410515135422,
385
+ "grad_norm": 0.3692304193973541,
386
+ "learning_rate": 5.207780138077536e-06,
387
+ "loss": 0.6272,
388
+ "step": 27000
389
+ },
390
+ {
391
+ "epoch": 7.302177376526819,
392
+ "grad_norm": 0.2998880445957184,
393
+ "learning_rate": 4.377987254381306e-06,
394
+ "loss": 0.6257,
395
+ "step": 27500
396
+ },
397
+ {
398
+ "epoch": 7.434944237918216,
399
+ "grad_norm": 0.35746654868125916,
400
+ "learning_rate": 3.5481943706850767e-06,
401
+ "loss": 0.6233,
402
+ "step": 28000
403
+ },
404
+ {
405
+ "epoch": 7.567711099309612,
406
+ "grad_norm": 0.27425265312194824,
407
+ "learning_rate": 2.7184014869888476e-06,
408
+ "loss": 0.6299,
409
+ "step": 28500
410
+ },
411
+ {
412
+ "epoch": 7.700477960701009,
413
+ "grad_norm": 0.5259220004081726,
414
+ "learning_rate": 1.8886086032926183e-06,
415
+ "loss": 0.6211,
416
+ "step": 29000
417
+ },
418
+ {
419
+ "epoch": 7.833244822092405,
420
+ "grad_norm": 0.38208672404289246,
421
+ "learning_rate": 1.0588157195963888e-06,
422
+ "loss": 0.6283,
423
+ "step": 29500
424
+ },
425
+ {
426
+ "epoch": 7.966011683483803,
427
+ "grad_norm": 0.2999090254306793,
428
+ "learning_rate": 2.2902283590015934e-07,
429
+ "loss": 0.6254,
430
+ "step": 30000
431
+ }
432
+ ],
433
+ "logging_steps": 500,
434
+ "max_steps": 30128,
435
+ "num_input_tokens_seen": 0,
436
+ "num_train_epochs": 8,
437
+ "save_steps": 500,
438
+ "stateful_callbacks": {
439
+ "TrainerControl": {
440
+ "args": {
441
+ "should_epoch_stop": false,
442
+ "should_evaluate": false,
443
+ "should_log": false,
444
+ "should_save": true,
445
+ "should_training_stop": false
446
+ },
447
+ "attributes": {}
448
+ }
449
+ },
450
+ "total_flos": 6.495695926198272e+16,
451
+ "train_batch_size": 32,
452
+ "trial_name": null,
453
+ "trial_params": null
454
+ }
checkpoints/{checkpoint-48000 → checkpoint-30000}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e94179d735e9ead00b90fad45af99e009779a12dba4e32a7dde92da29b59e62
+oid sha256:7c12c6ed93615e8828ca5922138bd2446a795024a161ed3d049b59ff409bebcb
 size 5240
checkpoints/{checkpoint-48958 → checkpoint-30128}/config.json RENAMED
File without changes
checkpoints/{checkpoint-48958 → checkpoint-30128}/generation_config.json RENAMED
File without changes
checkpoints/{checkpoint-48958 → checkpoint-30128}/model.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d8ea98de1cde992e950903fb96553ceb84e46b447461bc9f940922b80e9bc3c6
+oid sha256:c80a0cfc2db55db1dfa355590567e32e3a667c714291cca4e29780a242d11605
 size 242041896
checkpoints/{checkpoint-48958 → checkpoint-30128}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4b7c5f5d93aa888eb04d22fec7da585a62df18817ce300bb426e1c619ffd5fbd
+oid sha256:0ca0b72c9630999f085be077bc2d5ff31a13ddfc0da2bfe0c7a16c99d71a161d
 size 484163514
checkpoints/{checkpoint-48000 → checkpoint-30128}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:00ce3eb813e88edd424cdc90e44c1583d8456c098e486a0ebabbe770aaf7ec12
+oid sha256:473717e422ac16c93abfbd9c8cf2a13b9f39a608e9901f99ea816190dda20245
 size 14244
checkpoints/{checkpoint-48000 → checkpoint-30128}/scaler.pt RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ca97e2ef7b4a9b2ec6c7f4a1a43c701938b26da462b3f6f5c3deffe10916cd2d
+oid sha256:59a4acfb0da74c479080613978839dd3cbb4608fd2c07e764b4c844401d8dd5f
 size 988
checkpoints/{checkpoint-48958 → checkpoint-30128}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c8faa6a70188374ad73132930e26b5735fb0a78d3a8b7f1cb788c26dc910c2fd
+oid sha256:be9bcf427148cef79696303601d2ef7cb4d2ce4a9938dcdd5a913f3f44196236
 size 1064
checkpoints/{checkpoint-48958 → checkpoint-30128}/special_tokens_map.json RENAMED
File without changes
checkpoints/{checkpoint-48958 → checkpoint-30128}/spiece.model RENAMED
File without changes
checkpoints/{checkpoint-48958 → checkpoint-30128}/tokenizer.json RENAMED
File without changes
checkpoints/{checkpoint-48958 → checkpoint-30128}/tokenizer_config.json RENAMED
File without changes
checkpoints/checkpoint-30128/trainer_state.json ADDED
@@ -0,0 +1,454 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 8.0,
6
+ "eval_steps": 500,
7
+ "global_step": 30128,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.1327668613913967,
14
+ "grad_norm": 0.633160412311554,
15
+ "learning_rate": 4.917352628783856e-05,
16
+ "loss": 2.9498,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.2655337227827934,
21
+ "grad_norm": 0.4872244596481323,
22
+ "learning_rate": 4.834373340414233e-05,
23
+ "loss": 1.7486,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.3983005841741901,
28
+ "grad_norm": 0.4818345904350281,
29
+ "learning_rate": 4.75139405204461e-05,
30
+ "loss": 1.4326,
31
+ "step": 1500
32
+ },
33
+ {
34
+ "epoch": 0.5310674455655868,
35
+ "grad_norm": 0.5155183672904968,
36
+ "learning_rate": 4.668414763674987e-05,
37
+ "loss": 1.2762,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 0.6638343069569835,
42
+ "grad_norm": 0.4676671326160431,
43
+ "learning_rate": 4.585435475305364e-05,
44
+ "loss": 1.1738,
45
+ "step": 2500
46
+ },
47
+ {
48
+ "epoch": 0.7966011683483802,
49
+ "grad_norm": 0.40301135182380676,
50
+ "learning_rate": 4.502456186935741e-05,
51
+ "loss": 1.1018,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.929368029739777,
56
+ "grad_norm": 0.3687935471534729,
57
+ "learning_rate": 4.419476898566118e-05,
58
+ "loss": 1.0399,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 1.0621348911311737,
63
+ "grad_norm": 0.3971472978591919,
64
+ "learning_rate": 4.336497610196495e-05,
65
+ "loss": 1.002,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 1.1949017525225702,
70
+ "grad_norm": 0.3881046175956726,
71
+ "learning_rate": 4.253518321826872e-05,
72
+ "loss": 0.9598,
73
+ "step": 4500
74
+ },
75
+ {
76
+ "epoch": 1.327668613913967,
77
+ "grad_norm": 0.41512489318847656,
78
+ "learning_rate": 4.170539033457249e-05,
79
+ "loss": 0.9347,
80
+ "step": 5000
81
+ },
82
+ {
83
+ "epoch": 1.4604354753053639,
84
+ "grad_norm": 0.4095878303050995,
85
+ "learning_rate": 4.0875597450876265e-05,
86
+ "loss": 0.9037,
87
+ "step": 5500
88
+ },
89
+ {
90
+ "epoch": 1.5932023366967605,
91
+ "grad_norm": 0.42337608337402344,
92
+ "learning_rate": 4.004580456718004e-05,
93
+ "loss": 0.8754,
94
+ "step": 6000
95
+ },
96
+ {
97
+ "epoch": 1.725969198088157,
98
+ "grad_norm": 0.3757030963897705,
99
+ "learning_rate": 3.92176712692512e-05,
100
+ "loss": 0.8545,
101
+ "step": 6500
102
+ },
103
+ {
104
+ "epoch": 1.858736059479554,
105
+ "grad_norm": 0.3615691363811493,
106
+ "learning_rate": 3.838787838555496e-05,
107
+ "loss": 0.8403,
108
+ "step": 7000
109
+ },
110
+ {
111
+ "epoch": 1.9915029208709507,
112
+ "grad_norm": 0.3412405848503113,
113
+ "learning_rate": 3.7558085501858735e-05,
114
+ "loss": 0.8319,
115
+ "step": 7500
116
+ },
117
+ {
118
+ "epoch": 2.1242697822623473,
119
+ "grad_norm": 0.3392370641231537,
120
+ "learning_rate": 3.67299522039299e-05,
121
+ "loss": 0.8125,
122
+ "step": 8000
123
+ },
124
+ {
125
+ "epoch": 2.257036643653744,
126
+ "grad_norm": 0.3229221701622009,
127
+ "learning_rate": 3.5900159320233674e-05,
128
+ "loss": 0.7917,
129
+ "step": 8500
130
+ },
131
+ {
132
+ "epoch": 2.3898035050451405,
133
+ "grad_norm": 0.3485640585422516,
134
+ "learning_rate": 3.5070366436537446e-05,
135
+ "loss": 0.7852,
136
+ "step": 9000
137
+ },
138
+ {
139
+ "epoch": 2.5225703664365375,
140
+ "grad_norm": 0.32653355598449707,
141
+ "learning_rate": 3.424057355284121e-05,
142
+ "loss": 0.7725,
143
+ "step": 9500
144
+ },
145
+ {
146
+ "epoch": 2.655337227827934,
147
+ "grad_norm": 0.35505905747413635,
148
+ "learning_rate": 3.3410780669144984e-05,
149
+ "loss": 0.7636,
150
+ "step": 10000
151
+ },
152
+ {
153
+ "epoch": 2.7881040892193307,
154
+ "grad_norm": 0.36597949266433716,
155
+ "learning_rate": 3.258098778544875e-05,
156
+ "loss": 0.7495,
157
+ "step": 10500
158
+ },
159
+ {
160
+ "epoch": 2.9208709506107278,
161
+ "grad_norm": 0.3176648020744324,
162
+ "learning_rate": 3.175119490175252e-05,
163
+ "loss": 0.7416,
164
+ "step": 11000
165
+ },
166
+ {
167
+ "epoch": 3.0536378120021244,
168
+ "grad_norm": 0.5107095241546631,
169
+ "learning_rate": 3.0921402018056293e-05,
170
+ "loss": 0.7288,
171
+ "step": 11500
172
+ },
173
+ {
174
+ "epoch": 3.186404673393521,
175
+ "grad_norm": 0.3380516767501831,
176
+ "learning_rate": 3.0091609134360066e-05,
177
+ "loss": 0.7249,
178
+ "step": 12000
179
+ },
180
+ {
181
+ "epoch": 3.3191715347849176,
182
+ "grad_norm": 0.33378171920776367,
183
+ "learning_rate": 2.9261816250663838e-05,
184
+ "loss": 0.7212,
185
+ "step": 12500
186
+ },
187
+ {
188
+ "epoch": 3.451938396176314,
189
+ "grad_norm": 0.3314702808856964,
190
+ "learning_rate": 2.843202336696761e-05,
191
+ "loss": 0.715,
192
+ "step": 13000
193
+ },
194
+ {
195
+ "epoch": 3.584705257567711,
196
+ "grad_norm": 0.3204588294029236,
197
+ "learning_rate": 2.7602230483271375e-05,
198
+ "loss": 0.7026,
199
+ "step": 13500
200
+ },
201
+ {
202
+ "epoch": 3.717472118959108,
203
+ "grad_norm": 0.32662609219551086,
204
+ "learning_rate": 2.677409718534254e-05,
205
+ "loss": 0.6965,
206
+ "step": 14000
207
+ },
208
+ {
209
+ "epoch": 3.8502389803505044,
210
+ "grad_norm": 0.35398003458976746,
211
+ "learning_rate": 2.594430430164631e-05,
212
+ "loss": 0.6934,
213
+ "step": 14500
214
+ },
215
+ {
216
+ "epoch": 3.9830058417419014,
217
+ "grad_norm": 0.3326145112514496,
218
+ "learning_rate": 2.5116171003717474e-05,
219
+ "loss": 0.6882,
220
+ "step": 15000
221
+ },
222
+ {
223
+ "epoch": 4.115772703133298,
224
+ "grad_norm": 0.4153067469596863,
225
+ "learning_rate": 2.4286378120021243e-05,
226
+ "loss": 0.688,
227
+ "step": 15500
228
+ },
229
+ {
230
+ "epoch": 4.248539564524695,
231
+ "grad_norm": 0.295330673456192,
232
+ "learning_rate": 2.3456585236325015e-05,
233
+ "loss": 0.6789,
234
+ "step": 16000
235
+ },
236
+ {
237
+ "epoch": 4.381306425916091,
238
+ "grad_norm": 0.2986246943473816,
239
+ "learning_rate": 2.2626792352628784e-05,
240
+ "loss": 0.678,
241
+ "step": 16500
242
+ },
243
+ {
244
+ "epoch": 4.514073287307488,
245
+ "grad_norm": 0.3353865444660187,
246
+ "learning_rate": 2.1796999468932556e-05,
247
+ "loss": 0.671,
248
+ "step": 17000
249
+ },
250
+ {
251
+ "epoch": 4.646840148698884,
252
+ "grad_norm": 0.3019355833530426,
253
+ "learning_rate": 2.0967206585236325e-05,
254
+ "loss": 0.6637,
255
+ "step": 17500
256
+ },
257
+ {
258
+ "epoch": 4.779607010090281,
259
+ "grad_norm": 0.281649649143219,
260
+ "learning_rate": 2.0137413701540097e-05,
261
+ "loss": 0.6557,
262
+ "step": 18000
263
+ },
264
+ {
265
+ "epoch": 4.9123738714816785,
266
+ "grad_norm": 0.3320733904838562,
267
+ "learning_rate": 1.9307620817843866e-05,
268
+ "loss": 0.6643,
269
+ "step": 18500
270
+ },
271
+ {
272
+ "epoch": 5.045140732873075,
273
+ "grad_norm": 0.33195579051971436,
274
+ "learning_rate": 1.8479487519915033e-05,
275
+ "loss": 0.6513,
276
+ "step": 19000
277
+ },
278
+ {
279
+ "epoch": 5.177907594264472,
280
+ "grad_norm": 0.3125695288181305,
281
+ "learning_rate": 1.7649694636218798e-05,
282
+ "loss": 0.6513,
283
+ "step": 19500
284
+ },
285
+ {
286
+ "epoch": 5.310674455655868,
287
+ "grad_norm": 0.30982694029808044,
288
+ "learning_rate": 1.682156133828996e-05,
289
+ "loss": 0.6511,
290
+ "step": 20000
291
+ },
292
+ {
293
+ "epoch": 5.443441317047265,
294
+ "grad_norm": 0.307338684797287,
295
+ "learning_rate": 1.5991768454593734e-05,
296
+ "loss": 0.6517,
297
+ "step": 20500
298
+ },
299
+ {
300
+ "epoch": 5.5762081784386615,
301
+ "grad_norm": 0.29489612579345703,
302
+ "learning_rate": 1.5161975570897504e-05,
303
+ "loss": 0.642,
304
+ "step": 21000
305
+ },
306
+ {
307
+ "epoch": 5.708975039830058,
308
+ "grad_norm": 0.27674245834350586,
309
+ "learning_rate": 1.4332182687201276e-05,
310
+ "loss": 0.6452,
311
+ "step": 21500
312
+ },
313
+ {
314
+ "epoch": 5.8417419012214555,
315
+ "grad_norm": 0.3172672688961029,
316
+ "learning_rate": 1.3502389803505045e-05,
317
+ "loss": 0.6436,
318
+ "step": 22000
319
+ },
320
+ {
321
+ "epoch": 5.974508762612852,
322
+ "grad_norm": 0.3144547641277313,
323
+ "learning_rate": 1.2672596919808816e-05,
324
+ "loss": 0.6385,
325
+ "step": 22500
326
+ },
327
+ {
328
+ "epoch": 6.107275624004249,
329
+ "grad_norm": 0.3105868399143219,
330
+ "learning_rate": 1.1844463621879979e-05,
331
+ "loss": 0.6329,
332
+ "step": 23000
333
+ },
334
+ {
335
+ "epoch": 6.240042485395645,
336
+ "grad_norm": 0.3802029490470886,
337
+ "learning_rate": 1.101467073818375e-05,
338
+ "loss": 0.6352,
339
+ "step": 23500
340
+ },
341
+ {
342
+ "epoch": 6.372809346787042,
343
+ "grad_norm": 0.4196580946445465,
344
+ "learning_rate": 1.018487785448752e-05,
345
+ "loss": 0.634,
346
+ "step": 24000
347
+ },
348
+ {
349
+ "epoch": 6.5055762081784385,
350
+ "grad_norm": 0.28329288959503174,
351
+ "learning_rate": 9.35508497079129e-06,
352
+ "loss": 0.6339,
353
+ "step": 24500
354
+ },
355
+ {
356
+ "epoch": 6.638343069569835,
357
+ "grad_norm": 0.28894877433776855,
358
+ "learning_rate": 8.525292087095061e-06,
359
+ "loss": 0.6312,
360
+ "step": 25000
361
+ },
362
+ {
363
+ "epoch": 6.771109930961232,
364
+ "grad_norm": 0.3208027482032776,
365
+ "learning_rate": 7.697158789166224e-06,
366
+ "loss": 0.6361,
367
+ "step": 25500
368
+ },
369
+ {
370
+ "epoch": 6.903876792352628,
371
+ "grad_norm": 0.3069681227207184,
372
+ "learning_rate": 6.867365905469996e-06,
373
+ "loss": 0.6305,
374
+ "step": 26000
375
+ },
376
+ {
377
+ "epoch": 7.036643653744026,
378
+ "grad_norm": 0.35687127709388733,
379
+ "learning_rate": 6.037573021773765e-06,
380
+ "loss": 0.6283,
381
+ "step": 26500
382
+ },
383
+ {
384
+ "epoch": 7.169410515135422,
385
+ "grad_norm": 0.3692304193973541,
386
+ "learning_rate": 5.207780138077536e-06,
387
+ "loss": 0.6272,
388
+ "step": 27000
389
+ },
390
+ {
391
+ "epoch": 7.302177376526819,
392
+ "grad_norm": 0.2998880445957184,
393
+ "learning_rate": 4.377987254381306e-06,
394
+ "loss": 0.6257,
395
+ "step": 27500
396
+ },
397
+ {
398
+ "epoch": 7.434944237918216,
399
+ "grad_norm": 0.35746654868125916,
400
+ "learning_rate": 3.5481943706850767e-06,
401
+ "loss": 0.6233,
402
+ "step": 28000
403
+ },
404
+ {
405
+ "epoch": 7.567711099309612,
406
+ "grad_norm": 0.27425265312194824,
407
+ "learning_rate": 2.7184014869888476e-06,
408
+ "loss": 0.6299,
409
+ "step": 28500
410
+ },
411
+ {
412
+ "epoch": 7.700477960701009,
413
+ "grad_norm": 0.5259220004081726,
414
+ "learning_rate": 1.8886086032926183e-06,
415
+ "loss": 0.6211,
416
+ "step": 29000
417
+ },
418
+ {
419
+ "epoch": 7.833244822092405,
420
+ "grad_norm": 0.38208672404289246,
421
+ "learning_rate": 1.0588157195963888e-06,
422
+ "loss": 0.6283,
423
+ "step": 29500
424
+ },
425
+ {
426
+ "epoch": 7.966011683483803,
427
+ "grad_norm": 0.2999090254306793,
428
+ "learning_rate": 2.2902283590015934e-07,
429
+ "loss": 0.6254,
430
+ "step": 30000
431
+ }
432
+ ],
433
+ "logging_steps": 500,
434
+ "max_steps": 30128,
435
+ "num_input_tokens_seen": 0,
436
+ "num_train_epochs": 8,
437
+ "save_steps": 500,
438
+ "stateful_callbacks": {
439
+ "TrainerControl": {
440
+ "args": {
441
+ "should_epoch_stop": false,
442
+ "should_evaluate": false,
443
+ "should_log": false,
444
+ "should_save": true,
445
+ "should_training_stop": true
446
+ },
447
+ "attributes": {}
448
+ }
449
+ },
450
+ "total_flos": 6.523312420788634e+16,
451
+ "train_batch_size": 32,
452
+ "trial_name": null,
453
+ "trial_params": null
454
+ }
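As a quick sanity check on the final state above, a back-of-the-envelope sketch that assumes a single device and no gradient accumulation (neither of which the diff states):

# max_steps, num_train_epochs and train_batch_size come from the trainer state above.
max_steps, num_epochs, batch_size = 30128, 8, 32
steps_per_epoch = max_steps / num_epochs        # 3766.0 optimizer steps per epoch
pairs_per_epoch = steps_per_epoch * batch_size  # 120512.0, i.e. roughly 120k training pairs per epoch
print(steps_per_epoch, pairs_per_epoch)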
checkpoints/{checkpoint-48958 → checkpoint-30128}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e94179d735e9ead00b90fad45af99e009779a12dba4e32a7dde92da29b59e62
+oid sha256:7c12c6ed93615e8828ca5922138bd2446a795024a161ed3d049b59ff409bebcb
 size 5240
checkpoints/checkpoint-48000/trainer_state.json DELETED
@@ -1,706 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 12.745618693574084,
6
- "eval_steps": 500,
7
- "global_step": 48000,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.1327668613913967,
14
- "grad_norm": 0.9114758372306824,
15
- "learning_rate": 4.9492422076065206e-05,
16
- "loss": 2.9213,
17
- "step": 500
18
- },
19
- {
20
- "epoch": 0.2655337227827934,
21
- "grad_norm": 0.8418619632720947,
22
- "learning_rate": 4.89817803014829e-05,
23
- "loss": 1.7378,
24
- "step": 1000
25
- },
26
- {
27
- "epoch": 0.3983005841741901,
28
- "grad_norm": 0.709621012210846,
29
- "learning_rate": 4.8471138526900614e-05,
30
- "loss": 1.4102,
31
- "step": 1500
32
- },
33
- {
34
- "epoch": 0.5310674455655868,
35
- "grad_norm": 0.4253033995628357,
36
- "learning_rate": 4.796049675231832e-05,
37
- "loss": 1.247,
38
- "step": 2000
39
- },
40
- {
41
- "epoch": 0.6638343069569835,
42
- "grad_norm": 0.4772707521915436,
43
- "learning_rate": 4.744985497773602e-05,
44
- "loss": 1.1408,
45
- "step": 2500
46
- },
47
- {
48
- "epoch": 0.7966011683483802,
49
- "grad_norm": 0.43140164017677307,
50
- "learning_rate": 4.6939213203153725e-05,
51
- "loss": 1.0649,
52
- "step": 3000
53
- },
54
- {
55
- "epoch": 0.929368029739777,
56
- "grad_norm": 0.39506375789642334,
57
- "learning_rate": 4.642857142857143e-05,
58
- "loss": 1.0004,
59
- "step": 3500
60
- },
61
- {
62
- "epoch": 1.0621348911311737,
63
- "grad_norm": 0.3946131765842438,
64
- "learning_rate": 4.591792965398913e-05,
65
- "loss": 0.9611,
66
- "step": 4000
67
- },
68
- {
69
- "epoch": 1.1949017525225702,
70
- "grad_norm": 0.3581591546535492,
71
- "learning_rate": 4.540728787940684e-05,
72
- "loss": 0.917,
73
- "step": 4500
74
- },
75
- {
76
- "epoch": 1.327668613913967,
77
- "grad_norm": 0.42795246839523315,
78
- "learning_rate": 4.489664610482455e-05,
79
- "loss": 0.8907,
80
- "step": 5000
81
- },
82
- {
83
- "epoch": 1.4604354753053639,
84
- "grad_norm": 0.4635187089443207,
85
- "learning_rate": 4.4386004330242245e-05,
86
- "loss": 0.8598,
87
- "step": 5500
88
- },
89
- {
90
- "epoch": 1.5932023366967605,
91
- "grad_norm": 0.3534747064113617,
92
- "learning_rate": 4.3875362555659955e-05,
93
- "loss": 0.8295,
94
- "step": 6000
95
- },
96
- {
97
- "epoch": 1.725969198088157,
98
- "grad_norm": 0.39130711555480957,
99
- "learning_rate": 4.336472078107766e-05,
100
- "loss": 0.808,
101
- "step": 6500
102
- },
103
- {
104
- "epoch": 1.858736059479554,
105
- "grad_norm": 0.440703809261322,
106
- "learning_rate": 4.285407900649537e-05,
107
- "loss": 0.7927,
108
- "step": 7000
109
- },
110
- {
111
- "epoch": 1.9915029208709507,
112
- "grad_norm": 0.3961372375488281,
113
- "learning_rate": 4.234343723191307e-05,
114
- "loss": 0.7828,
115
- "step": 7500
116
- },
117
- {
118
- "epoch": 2.1242697822623473,
119
- "grad_norm": 0.3364185690879822,
120
- "learning_rate": 4.1833816740879935e-05,
121
- "loss": 0.7628,
122
- "step": 8000
123
- },
124
- {
125
- "epoch": 2.257036643653744,
126
- "grad_norm": 0.34151241183280945,
127
- "learning_rate": 4.1323174966297646e-05,
128
- "loss": 0.7424,
129
- "step": 8500
130
- },
131
- {
132
- "epoch": 2.3898035050451405,
133
- "grad_norm": 0.3292118310928345,
134
- "learning_rate": 4.081253319171535e-05,
135
- "loss": 0.7341,
136
- "step": 9000
137
- },
138
- {
139
- "epoch": 2.5225703664365375,
140
- "grad_norm": 0.33259105682373047,
141
- "learning_rate": 4.0301891417133054e-05,
142
- "loss": 0.7212,
143
- "step": 9500
144
- },
145
- {
146
- "epoch": 2.655337227827934,
147
- "grad_norm": 0.35891205072402954,
148
- "learning_rate": 3.979124964255076e-05,
149
- "loss": 0.712,
150
- "step": 10000
151
- },
152
- {
153
- "epoch": 2.7881040892193307,
154
- "grad_norm": 0.3436354398727417,
155
- "learning_rate": 3.928060786796847e-05,
156
- "loss": 0.6975,
157
- "step": 10500
158
- },
159
- {
160
- "epoch": 2.9208709506107278,
161
- "grad_norm": 0.3285069465637207,
162
- "learning_rate": 3.8769966093386165e-05,
163
- "loss": 0.6881,
164
- "step": 11000
165
- },
166
- {
167
- "epoch": 3.0536378120021244,
168
- "grad_norm": 0.44189152121543884,
169
- "learning_rate": 3.826034560235304e-05,
170
- "loss": 0.6749,
171
- "step": 11500
172
- },
173
- {
174
- "epoch": 3.186404673393521,
175
- "grad_norm": 0.34346967935562134,
176
- "learning_rate": 3.7749703827770744e-05,
177
- "loss": 0.6704,
178
- "step": 12000
179
- },
180
- {
181
- "epoch": 3.3191715347849176,
182
- "grad_norm": 0.29128846526145935,
183
- "learning_rate": 3.723906205318845e-05,
184
- "loss": 0.6652,
185
- "step": 12500
186
- },
187
- {
188
- "epoch": 3.451938396176314,
189
- "grad_norm": 0.3013540208339691,
190
- "learning_rate": 3.672842027860615e-05,
191
- "loss": 0.6588,
192
- "step": 13000
193
- },
194
- {
195
- "epoch": 3.584705257567711,
196
- "grad_norm": 0.32138076424598694,
197
- "learning_rate": 3.6217778504023856e-05,
198
- "loss": 0.6453,
199
- "step": 13500
200
- },
201
- {
202
- "epoch": 3.717472118959108,
203
- "grad_norm": 0.3408374786376953,
204
- "learning_rate": 3.5707136729441566e-05,
205
- "loss": 0.6388,
206
- "step": 14000
207
- },
208
- {
209
- "epoch": 3.8502389803505044,
210
- "grad_norm": 0.9397606253623962,
211
- "learning_rate": 3.519649495485927e-05,
212
- "loss": 0.6349,
213
- "step": 14500
214
- },
215
- {
216
- "epoch": 3.9830058417419014,
217
- "grad_norm": 0.3192440867424011,
218
- "learning_rate": 3.4685853180276974e-05,
219
- "loss": 0.6291,
220
- "step": 15000
221
- },
222
- {
223
- "epoch": 4.115772703133298,
224
- "grad_norm": 0.3549179136753082,
225
- "learning_rate": 3.417521140569468e-05,
226
- "loss": 0.6278,
227
- "step": 15500
228
- },
229
- {
230
- "epoch": 4.248539564524695,
231
- "grad_norm": 0.3110153079032898,
232
- "learning_rate": 3.366456963111239e-05,
233
- "loss": 0.618,
234
- "step": 16000
235
- },
236
- {
237
- "epoch": 4.381306425916091,
238
- "grad_norm": 0.2719564735889435,
239
- "learning_rate": 3.3153927856530086e-05,
240
- "loss": 0.6169,
241
- "step": 16500
242
- },
243
- {
244
- "epoch": 4.514073287307488,
245
- "grad_norm": 0.2858710289001465,
246
- "learning_rate": 3.2643286081947796e-05,
247
- "loss": 0.61,
248
- "step": 17000
249
- },
250
- {
251
- "epoch": 4.646840148698884,
252
- "grad_norm": 0.31373563408851624,
253
- "learning_rate": 3.21326443073655e-05,
254
- "loss": 0.6011,
255
- "step": 17500
256
- },
257
- {
258
- "epoch": 4.779607010090281,
259
- "grad_norm": 0.29438045620918274,
260
- "learning_rate": 3.1622002532783204e-05,
261
- "loss": 0.5938,
262
- "step": 18000
263
- },
264
- {
265
- "epoch": 4.9123738714816785,
266
- "grad_norm": 0.3415851593017578,
267
- "learning_rate": 3.111238204175007e-05,
268
- "loss": 0.5992,
269
- "step": 18500
270
- },
271
- {
272
- "epoch": 5.045140732873075,
273
- "grad_norm": 0.35383546352386475,
274
- "learning_rate": 3.060276155071694e-05,
275
- "loss": 0.5871,
276
- "step": 19000
277
- },
278
- {
279
- "epoch": 5.177907594264472,
280
- "grad_norm": 0.3242381811141968,
281
- "learning_rate": 3.009314105968381e-05,
282
- "loss": 0.5867,
283
- "step": 19500
284
- },
285
- {
286
- "epoch": 5.310674455655868,
287
- "grad_norm": 0.28274649381637573,
288
- "learning_rate": 2.9582499285101516e-05,
289
- "loss": 0.584,
290
- "step": 20000
291
- },
292
- {
293
- "epoch": 5.443441317047265,
294
- "grad_norm": 0.3075231611728668,
295
- "learning_rate": 2.9071857510519223e-05,
296
- "loss": 0.584,
297
- "step": 20500
298
- },
299
- {
300
- "epoch": 5.5762081784386615,
301
- "grad_norm": 0.29568806290626526,
302
- "learning_rate": 2.8561215735936924e-05,
303
- "loss": 0.5743,
304
- "step": 21000
305
- },
306
- {
307
- "epoch": 5.708975039830058,
308
- "grad_norm": 0.32808518409729004,
309
- "learning_rate": 2.805057396135463e-05,
310
- "loss": 0.5757,
311
- "step": 21500
312
- },
313
- {
314
- "epoch": 5.8417419012214555,
315
- "grad_norm": 0.256596177816391,
316
- "learning_rate": 2.7539932186772338e-05,
317
- "loss": 0.5735,
318
- "step": 22000
319
- },
320
- {
321
- "epoch": 5.974508762612852,
322
- "grad_norm": 0.313557505607605,
323
- "learning_rate": 2.702929041219004e-05,
324
- "loss": 0.5679,
325
- "step": 22500
326
- },
327
- {
328
- "epoch": 6.107275624004249,
329
- "grad_norm": 0.274058997631073,
330
- "learning_rate": 2.6518648637607746e-05,
331
- "loss": 0.562,
332
- "step": 23000
333
- },
334
- {
335
- "epoch": 6.240042485395645,
336
- "grad_norm": 0.2777511477470398,
337
- "learning_rate": 2.6008006863025453e-05,
338
- "loss": 0.5619,
339
- "step": 23500
340
- },
341
- {
342
- "epoch": 6.372809346787042,
343
- "grad_norm": 0.3301125466823578,
344
- "learning_rate": 2.549736508844316e-05,
345
- "loss": 0.5598,
346
- "step": 24000
347
- },
348
- {
349
- "epoch": 6.5055762081784385,
350
- "grad_norm": 0.2844313383102417,
351
- "learning_rate": 2.498672331386086e-05,
352
- "loss": 0.5589,
353
- "step": 24500
354
- },
355
- {
356
- "epoch": 6.638343069569835,
357
- "grad_norm": 0.268718421459198,
358
- "learning_rate": 2.4476081539278568e-05,
359
- "loss": 0.5566,
360
- "step": 25000
361
- },
362
- {
363
- "epoch": 6.771109930961232,
364
- "grad_norm": 0.3230023980140686,
365
- "learning_rate": 2.3965439764696272e-05,
366
- "loss": 0.5582,
367
- "step": 25500
368
- },
369
- {
370
- "epoch": 6.903876792352628,
371
- "grad_norm": 0.27747681736946106,
372
- "learning_rate": 2.3454797990113976e-05,
373
- "loss": 0.5527,
374
- "step": 26000
375
- },
376
- {
377
- "epoch": 7.036643653744026,
378
- "grad_norm": 0.29863470792770386,
379
- "learning_rate": 2.2945177499080848e-05,
380
- "loss": 0.5491,
381
- "step": 26500
382
- },
383
- {
384
- "epoch": 7.169410515135422,
385
- "grad_norm": 0.30289873480796814,
386
- "learning_rate": 2.243453572449855e-05,
387
- "loss": 0.5468,
388
- "step": 27000
389
- },
390
- {
391
- "epoch": 7.302177376526819,
392
- "grad_norm": 0.2766277492046356,
393
- "learning_rate": 2.192491523346542e-05,
394
- "loss": 0.5444,
395
- "step": 27500
396
- },
397
- {
398
- "epoch": 7.434944237918216,
399
- "grad_norm": 0.3069545030593872,
400
- "learning_rate": 2.1414273458883124e-05,
401
- "loss": 0.5403,
402
- "step": 28000
403
- },
404
- {
405
- "epoch": 7.567711099309612,
406
- "grad_norm": 0.258329302072525,
407
- "learning_rate": 2.090363168430083e-05,
408
- "loss": 0.5453,
409
- "step": 28500
410
- },
411
- {
412
- "epoch": 7.700477960701009,
413
- "grad_norm": 0.2901703119277954,
414
- "learning_rate": 2.0392989909718535e-05,
415
- "loss": 0.5357,
416
- "step": 29000
417
- },
418
- {
419
- "epoch": 7.833244822092405,
420
- "grad_norm": 0.35300034284591675,
421
- "learning_rate": 1.988234813513624e-05,
422
- "loss": 0.541,
423
- "step": 29500
424
- },
425
- {
426
- "epoch": 7.966011683483803,
427
- "grad_norm": 0.2620261311531067,
428
- "learning_rate": 1.9371706360553946e-05,
429
- "loss": 0.5371,
430
- "step": 30000
431
- },
432
- {
433
- "epoch": 8.098778544875199,
434
- "grad_norm": 0.3098488450050354,
435
- "learning_rate": 1.886106458597165e-05,
436
- "loss": 0.5337,
437
- "step": 30500
438
- },
439
- {
440
- "epoch": 8.231545406266596,
441
- "grad_norm": 0.2904013991355896,
442
- "learning_rate": 1.8350422811389357e-05,
443
- "loss": 0.5342,
444
- "step": 31000
445
- },
446
- {
447
- "epoch": 8.364312267657992,
448
- "grad_norm": 0.29218047857284546,
449
- "learning_rate": 1.783978103680706e-05,
450
- "loss": 0.5323,
451
- "step": 31500
452
- },
453
- {
454
- "epoch": 8.49707912904939,
455
- "grad_norm": 0.3310258090496063,
456
- "learning_rate": 1.7329139262224765e-05,
457
- "loss": 0.5258,
458
- "step": 32000
459
- },
460
- {
461
- "epoch": 8.629845990440787,
462
- "grad_norm": 0.3069627583026886,
463
- "learning_rate": 1.6818497487642472e-05,
464
- "loss": 0.5299,
465
- "step": 32500
466
- },
467
- {
468
- "epoch": 8.762612851832182,
469
- "grad_norm": 0.24625258147716522,
470
- "learning_rate": 1.630887699660934e-05,
471
- "loss": 0.5285,
472
- "step": 33000
473
- },
474
- {
475
- "epoch": 8.89537971322358,
476
- "grad_norm": 0.26636838912963867,
477
- "learning_rate": 1.5798235222027044e-05,
478
- "loss": 0.5294,
479
- "step": 33500
480
- },
481
- {
482
- "epoch": 9.028146574614976,
483
- "grad_norm": 0.2842467725276947,
484
- "learning_rate": 1.5287593447444748e-05,
485
- "loss": 0.5235,
486
- "step": 34000
487
- },
488
- {
489
- "epoch": 9.160913436006373,
490
- "grad_norm": 0.3261110782623291,
491
- "learning_rate": 1.4776951672862455e-05,
492
- "loss": 0.5256,
493
- "step": 34500
494
- },
495
- {
496
- "epoch": 9.293680297397769,
497
- "grad_norm": 0.2750456929206848,
498
- "learning_rate": 1.4266309898280159e-05,
499
- "loss": 0.5218,
500
- "step": 35000
501
- },
502
- {
503
- "epoch": 9.426447158789166,
504
- "grad_norm": 0.26470229029655457,
505
- "learning_rate": 1.3755668123697864e-05,
506
- "loss": 0.522,
507
- "step": 35500
508
- },
509
- {
510
- "epoch": 9.559214020180562,
511
- "grad_norm": 0.24200379848480225,
512
- "learning_rate": 1.3245026349115568e-05,
513
- "loss": 0.5222,
514
- "step": 36000
515
- },
516
- {
517
- "epoch": 9.69198088157196,
518
- "grad_norm": 0.30407610535621643,
519
- "learning_rate": 1.2734384574533272e-05,
520
- "loss": 0.5208,
521
- "step": 36500
522
- },
523
- {
524
- "epoch": 9.824747742963357,
525
- "grad_norm": 0.26741334795951843,
526
- "learning_rate": 1.2224764083500144e-05,
527
- "loss": 0.5185,
528
- "step": 37000
529
- },
530
- {
531
- "epoch": 9.957514604354753,
532
- "grad_norm": 0.2811224162578583,
533
- "learning_rate": 1.1714122308917848e-05,
534
- "loss": 0.515,
535
- "step": 37500
536
- },
537
- {
538
- "epoch": 10.09028146574615,
539
- "grad_norm": 0.2725277543067932,
540
- "learning_rate": 1.1204501817884718e-05,
541
- "loss": 0.517,
542
- "step": 38000
543
- },
544
- {
545
- "epoch": 10.223048327137546,
546
- "grad_norm": 0.31137147545814514,
547
- "learning_rate": 1.0693860043302423e-05,
548
- "loss": 0.5155,
549
- "step": 38500
550
- },
551
- {
552
- "epoch": 10.355815188528943,
553
- "grad_norm": 0.26093247532844543,
554
- "learning_rate": 1.0183218268720129e-05,
555
- "loss": 0.5148,
556
- "step": 39000
557
- },
558
- {
559
- "epoch": 10.488582049920339,
560
- "grad_norm": 0.2848931550979614,
561
- "learning_rate": 9.672576494137833e-06,
562
- "loss": 0.5134,
563
- "step": 39500
564
- },
565
- {
566
- "epoch": 10.621348911311737,
567
- "grad_norm": 0.24945715069770813,
568
- "learning_rate": 9.161934719555536e-06,
569
- "loss": 0.5136,
570
- "step": 40000
571
- },
572
- {
573
- "epoch": 10.754115772703134,
574
- "grad_norm": 0.28524720668792725,
575
- "learning_rate": 8.651292944973242e-06,
576
- "loss": 0.5167,
577
- "step": 40500
578
- },
579
- {
580
- "epoch": 10.88688263409453,
581
- "grad_norm": 0.29454296827316284,
582
- "learning_rate": 8.140651170390948e-06,
583
- "loss": 0.5151,
584
- "step": 41000
585
- },
586
- {
587
- "epoch": 11.019649495485927,
588
- "grad_norm": 0.30919119715690613,
589
- "learning_rate": 7.632051962906982e-06,
590
- "loss": 0.5121,
591
- "step": 41500
592
- },
593
- {
594
- "epoch": 11.152416356877323,
595
- "grad_norm": 0.36948204040527344,
596
- "learning_rate": 7.121410188324687e-06,
597
- "loss": 0.5146,
598
- "step": 42000
599
- },
600
- {
601
- "epoch": 11.28518321826872,
602
- "grad_norm": 0.2883196771144867,
603
- "learning_rate": 6.610768413742392e-06,
604
- "loss": 0.5118,
605
- "step": 42500
606
- },
607
- {
608
- "epoch": 11.417950079660116,
609
- "grad_norm": 0.2851753532886505,
610
- "learning_rate": 6.100126639160097e-06,
611
- "loss": 0.5092,
612
- "step": 43000
613
- },
614
- {
615
- "epoch": 11.550716941051514,
616
- "grad_norm": 0.27395716309547424,
617
- "learning_rate": 5.5894848645778016e-06,
618
- "loss": 0.5044,
619
- "step": 43500
620
- },
621
- {
622
- "epoch": 11.683483802442911,
623
- "grad_norm": 0.2726575434207916,
624
- "learning_rate": 5.078843089995506e-06,
625
- "loss": 0.5106,
626
- "step": 44000
627
- },
628
- {
629
- "epoch": 11.816250663834307,
630
- "grad_norm": 0.29727038741111755,
631
- "learning_rate": 4.568201315413211e-06,
632
- "loss": 0.5095,
633
- "step": 44500
634
- },
635
- {
636
- "epoch": 11.949017525225704,
637
- "grad_norm": 0.2694978713989258,
638
- "learning_rate": 4.0575595408309166e-06,
639
- "loss": 0.5118,
640
- "step": 45000
641
- },
642
- {
643
- "epoch": 12.0817843866171,
644
- "grad_norm": 0.2318025678396225,
645
- "learning_rate": 3.5469177662486213e-06,
646
- "loss": 0.5126,
647
- "step": 45500
648
- },
649
- {
650
- "epoch": 12.214551248008497,
651
- "grad_norm": 0.27759501338005066,
652
- "learning_rate": 3.0362759916663264e-06,
653
- "loss": 0.5081,
654
- "step": 46000
655
- },
656
- {
657
- "epoch": 12.347318109399893,
658
- "grad_norm": 0.2869941294193268,
659
- "learning_rate": 2.525634217084031e-06,
660
- "loss": 0.5046,
661
- "step": 46500
662
- },
663
- {
664
- "epoch": 12.48008497079129,
665
- "grad_norm": 0.32994431257247925,
666
- "learning_rate": 2.0149924425017362e-06,
667
- "loss": 0.5104,
668
- "step": 47000
669
- },
670
- {
671
- "epoch": 12.612851832182688,
672
- "grad_norm": 0.28273916244506836,
673
- "learning_rate": 1.5053719514686058e-06,
674
- "loss": 0.5036,
675
- "step": 47500
676
- },
677
- {
678
- "epoch": 12.745618693574084,
679
- "grad_norm": 0.2604888379573822,
680
- "learning_rate": 9.947301768863107e-07,
681
- "loss": 0.5086,
682
- "step": 48000
683
- }
684
- ],
685
- "logging_steps": 500,
686
- "max_steps": 48958,
687
- "num_input_tokens_seen": 0,
688
- "num_train_epochs": 13,
689
- "save_steps": 500,
690
- "stateful_callbacks": {
691
- "TrainerControl": {
692
- "args": {
693
- "should_epoch_stop": false,
694
- "should_evaluate": false,
695
- "should_log": false,
696
- "should_save": true,
697
- "should_training_stop": false
698
- },
699
- "attributes": {}
700
- }
701
- },
702
- "total_flos": 1.0393032276836352e+17,
703
- "train_batch_size": 32,
704
- "trial_name": null,
705
- "trial_params": null
706
- }
checkpoints/checkpoint-48500/trainer_state.json DELETED
@@ -1,713 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 12.878385554965481,
6
- "eval_steps": 500,
7
- "global_step": 48500,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.1327668613913967,
14
- "grad_norm": 0.9114758372306824,
15
- "learning_rate": 4.9492422076065206e-05,
16
- "loss": 2.9213,
17
- "step": 500
18
- },
19
- {
20
- "epoch": 0.2655337227827934,
21
- "grad_norm": 0.8418619632720947,
22
- "learning_rate": 4.89817803014829e-05,
23
- "loss": 1.7378,
24
- "step": 1000
25
- },
26
- {
27
- "epoch": 0.3983005841741901,
28
- "grad_norm": 0.709621012210846,
29
- "learning_rate": 4.8471138526900614e-05,
30
- "loss": 1.4102,
31
- "step": 1500
32
- },
33
- {
34
- "epoch": 0.5310674455655868,
35
- "grad_norm": 0.4253033995628357,
36
- "learning_rate": 4.796049675231832e-05,
37
- "loss": 1.247,
38
- "step": 2000
39
- },
40
- {
41
- "epoch": 0.6638343069569835,
42
- "grad_norm": 0.4772707521915436,
43
- "learning_rate": 4.744985497773602e-05,
44
- "loss": 1.1408,
45
- "step": 2500
46
- },
47
- {
48
- "epoch": 0.7966011683483802,
49
- "grad_norm": 0.43140164017677307,
50
- "learning_rate": 4.6939213203153725e-05,
51
- "loss": 1.0649,
52
- "step": 3000
53
- },
54
- {
55
- "epoch": 0.929368029739777,
56
- "grad_norm": 0.39506375789642334,
57
- "learning_rate": 4.642857142857143e-05,
58
- "loss": 1.0004,
59
- "step": 3500
60
- },
61
- {
62
- "epoch": 1.0621348911311737,
63
- "grad_norm": 0.3946131765842438,
64
- "learning_rate": 4.591792965398913e-05,
65
- "loss": 0.9611,
66
- "step": 4000
67
- },
68
- {
69
- "epoch": 1.1949017525225702,
70
- "grad_norm": 0.3581591546535492,
71
- "learning_rate": 4.540728787940684e-05,
72
- "loss": 0.917,
73
- "step": 4500
74
- },
75
- {
76
- "epoch": 1.327668613913967,
77
- "grad_norm": 0.42795246839523315,
78
- "learning_rate": 4.489664610482455e-05,
79
- "loss": 0.8907,
80
- "step": 5000
81
- },
82
- {
83
- "epoch": 1.4604354753053639,
84
- "grad_norm": 0.4635187089443207,
85
- "learning_rate": 4.4386004330242245e-05,
86
- "loss": 0.8598,
87
- "step": 5500
88
- },
89
- {
90
- "epoch": 1.5932023366967605,
91
- "grad_norm": 0.3534747064113617,
92
- "learning_rate": 4.3875362555659955e-05,
93
- "loss": 0.8295,
94
- "step": 6000
95
- },
96
- {
97
- "epoch": 1.725969198088157,
98
- "grad_norm": 0.39130711555480957,
99
- "learning_rate": 4.336472078107766e-05,
100
- "loss": 0.808,
101
- "step": 6500
102
- },
103
- {
104
- "epoch": 1.858736059479554,
105
- "grad_norm": 0.440703809261322,
106
- "learning_rate": 4.285407900649537e-05,
107
- "loss": 0.7927,
108
- "step": 7000
109
- },
110
- {
111
- "epoch": 1.9915029208709507,
112
- "grad_norm": 0.3961372375488281,
113
- "learning_rate": 4.234343723191307e-05,
114
- "loss": 0.7828,
115
- "step": 7500
116
- },
117
- {
118
- "epoch": 2.1242697822623473,
119
- "grad_norm": 0.3364185690879822,
120
- "learning_rate": 4.1833816740879935e-05,
121
- "loss": 0.7628,
122
- "step": 8000
123
- },
124
- {
125
- "epoch": 2.257036643653744,
126
- "grad_norm": 0.34151241183280945,
127
- "learning_rate": 4.1323174966297646e-05,
128
- "loss": 0.7424,
129
- "step": 8500
130
- },
131
- {
132
- "epoch": 2.3898035050451405,
133
- "grad_norm": 0.3292118310928345,
134
- "learning_rate": 4.081253319171535e-05,
135
- "loss": 0.7341,
136
- "step": 9000
137
- },
138
- {
139
- "epoch": 2.5225703664365375,
140
- "grad_norm": 0.33259105682373047,
141
- "learning_rate": 4.0301891417133054e-05,
142
- "loss": 0.7212,
143
- "step": 9500
144
- },
145
- {
146
- "epoch": 2.655337227827934,
147
- "grad_norm": 0.35891205072402954,
148
- "learning_rate": 3.979124964255076e-05,
149
- "loss": 0.712,
150
- "step": 10000
151
- },
152
- {
153
- "epoch": 2.7881040892193307,
154
- "grad_norm": 0.3436354398727417,
155
- "learning_rate": 3.928060786796847e-05,
156
- "loss": 0.6975,
157
- "step": 10500
158
- },
159
- {
160
- "epoch": 2.9208709506107278,
161
- "grad_norm": 0.3285069465637207,
162
- "learning_rate": 3.8769966093386165e-05,
163
- "loss": 0.6881,
164
- "step": 11000
165
- },
166
- {
167
- "epoch": 3.0536378120021244,
168
- "grad_norm": 0.44189152121543884,
169
- "learning_rate": 3.826034560235304e-05,
170
- "loss": 0.6749,
171
- "step": 11500
172
- },
173
- {
174
- "epoch": 3.186404673393521,
175
- "grad_norm": 0.34346967935562134,
176
- "learning_rate": 3.7749703827770744e-05,
177
- "loss": 0.6704,
178
- "step": 12000
179
- },
180
- {
181
- "epoch": 3.3191715347849176,
182
- "grad_norm": 0.29128846526145935,
183
- "learning_rate": 3.723906205318845e-05,
184
- "loss": 0.6652,
185
- "step": 12500
186
- },
187
- {
188
- "epoch": 3.451938396176314,
189
- "grad_norm": 0.3013540208339691,
190
- "learning_rate": 3.672842027860615e-05,
191
- "loss": 0.6588,
192
- "step": 13000
193
- },
194
- {
195
- "epoch": 3.584705257567711,
196
- "grad_norm": 0.32138076424598694,
197
- "learning_rate": 3.6217778504023856e-05,
198
- "loss": 0.6453,
199
- "step": 13500
200
- },
201
- {
202
- "epoch": 3.717472118959108,
203
- "grad_norm": 0.3408374786376953,
204
- "learning_rate": 3.5707136729441566e-05,
205
- "loss": 0.6388,
206
- "step": 14000
207
- },
208
- {
209
- "epoch": 3.8502389803505044,
210
- "grad_norm": 0.9397606253623962,
211
- "learning_rate": 3.519649495485927e-05,
212
- "loss": 0.6349,
213
- "step": 14500
214
- },
215
- {
216
- "epoch": 3.9830058417419014,
217
- "grad_norm": 0.3192440867424011,
218
- "learning_rate": 3.4685853180276974e-05,
219
- "loss": 0.6291,
220
- "step": 15000
221
- },
222
- {
223
- "epoch": 4.115772703133298,
224
- "grad_norm": 0.3549179136753082,
225
- "learning_rate": 3.417521140569468e-05,
226
- "loss": 0.6278,
227
- "step": 15500
228
- },
229
- {
230
- "epoch": 4.248539564524695,
231
- "grad_norm": 0.3110153079032898,
232
- "learning_rate": 3.366456963111239e-05,
233
- "loss": 0.618,
234
- "step": 16000
235
- },
236
- {
237
- "epoch": 4.381306425916091,
238
- "grad_norm": 0.2719564735889435,
239
- "learning_rate": 3.3153927856530086e-05,
240
- "loss": 0.6169,
241
- "step": 16500
242
- },
243
- {
244
- "epoch": 4.514073287307488,
245
- "grad_norm": 0.2858710289001465,
246
- "learning_rate": 3.2643286081947796e-05,
247
- "loss": 0.61,
248
- "step": 17000
249
- },
250
- {
251
- "epoch": 4.646840148698884,
252
- "grad_norm": 0.31373563408851624,
253
- "learning_rate": 3.21326443073655e-05,
254
- "loss": 0.6011,
255
- "step": 17500
256
- },
257
- {
258
- "epoch": 4.779607010090281,
259
- "grad_norm": 0.29438045620918274,
260
- "learning_rate": 3.1622002532783204e-05,
261
- "loss": 0.5938,
262
- "step": 18000
263
- },
264
- {
265
- "epoch": 4.9123738714816785,
266
- "grad_norm": 0.3415851593017578,
267
- "learning_rate": 3.111238204175007e-05,
268
- "loss": 0.5992,
269
- "step": 18500
270
- },
271
- {
272
- "epoch": 5.045140732873075,
273
- "grad_norm": 0.35383546352386475,
274
- "learning_rate": 3.060276155071694e-05,
275
- "loss": 0.5871,
276
- "step": 19000
277
- },
278
- {
279
- "epoch": 5.177907594264472,
280
- "grad_norm": 0.3242381811141968,
281
- "learning_rate": 3.009314105968381e-05,
282
- "loss": 0.5867,
283
- "step": 19500
284
- },
285
- {
286
- "epoch": 5.310674455655868,
287
- "grad_norm": 0.28274649381637573,
288
- "learning_rate": 2.9582499285101516e-05,
289
- "loss": 0.584,
290
- "step": 20000
291
- },
292
- {
293
- "epoch": 5.443441317047265,
294
- "grad_norm": 0.3075231611728668,
295
- "learning_rate": 2.9071857510519223e-05,
296
- "loss": 0.584,
297
- "step": 20500
298
- },
299
- {
300
- "epoch": 5.5762081784386615,
301
- "grad_norm": 0.29568806290626526,
302
- "learning_rate": 2.8561215735936924e-05,
303
- "loss": 0.5743,
304
- "step": 21000
305
- },
306
- {
307
- "epoch": 5.708975039830058,
308
- "grad_norm": 0.32808518409729004,
309
- "learning_rate": 2.805057396135463e-05,
310
- "loss": 0.5757,
311
- "step": 21500
312
- },
313
- {
314
- "epoch": 5.8417419012214555,
315
- "grad_norm": 0.256596177816391,
316
- "learning_rate": 2.7539932186772338e-05,
317
- "loss": 0.5735,
318
- "step": 22000
319
- },
320
- {
321
- "epoch": 5.974508762612852,
322
- "grad_norm": 0.313557505607605,
323
- "learning_rate": 2.702929041219004e-05,
324
- "loss": 0.5679,
325
- "step": 22500
326
- },
327
- {
328
- "epoch": 6.107275624004249,
329
- "grad_norm": 0.274058997631073,
330
- "learning_rate": 2.6518648637607746e-05,
331
- "loss": 0.562,
332
- "step": 23000
333
- },
334
- {
335
- "epoch": 6.240042485395645,
336
- "grad_norm": 0.2777511477470398,
337
- "learning_rate": 2.6008006863025453e-05,
338
- "loss": 0.5619,
339
- "step": 23500
340
- },
341
- {
342
- "epoch": 6.372809346787042,
343
- "grad_norm": 0.3301125466823578,
344
- "learning_rate": 2.549736508844316e-05,
345
- "loss": 0.5598,
346
- "step": 24000
347
- },
348
- {
349
- "epoch": 6.5055762081784385,
350
- "grad_norm": 0.2844313383102417,
351
- "learning_rate": 2.498672331386086e-05,
352
- "loss": 0.5589,
353
- "step": 24500
354
- },
355
- {
356
- "epoch": 6.638343069569835,
357
- "grad_norm": 0.268718421459198,
358
- "learning_rate": 2.4476081539278568e-05,
359
- "loss": 0.5566,
360
- "step": 25000
361
- },
362
- {
363
- "epoch": 6.771109930961232,
364
- "grad_norm": 0.3230023980140686,
365
- "learning_rate": 2.3965439764696272e-05,
366
- "loss": 0.5582,
367
- "step": 25500
368
- },
369
- {
370
- "epoch": 6.903876792352628,
371
- "grad_norm": 0.27747681736946106,
372
- "learning_rate": 2.3454797990113976e-05,
373
- "loss": 0.5527,
374
- "step": 26000
375
- },
376
- {
377
- "epoch": 7.036643653744026,
378
- "grad_norm": 0.29863470792770386,
379
- "learning_rate": 2.2945177499080848e-05,
380
- "loss": 0.5491,
381
- "step": 26500
382
- },
383
- {
384
- "epoch": 7.169410515135422,
385
- "grad_norm": 0.30289873480796814,
386
- "learning_rate": 2.243453572449855e-05,
387
- "loss": 0.5468,
388
- "step": 27000
389
- },
390
- {
391
- "epoch": 7.302177376526819,
392
- "grad_norm": 0.2766277492046356,
393
- "learning_rate": 2.192491523346542e-05,
394
- "loss": 0.5444,
395
- "step": 27500
396
- },
397
- {
398
- "epoch": 7.434944237918216,
399
- "grad_norm": 0.3069545030593872,
400
- "learning_rate": 2.1414273458883124e-05,
401
- "loss": 0.5403,
402
- "step": 28000
403
- },
404
- {
405
- "epoch": 7.567711099309612,
406
- "grad_norm": 0.258329302072525,
407
- "learning_rate": 2.090363168430083e-05,
408
- "loss": 0.5453,
409
- "step": 28500
410
- },
411
- {
412
- "epoch": 7.700477960701009,
413
- "grad_norm": 0.2901703119277954,
414
- "learning_rate": 2.0392989909718535e-05,
415
- "loss": 0.5357,
416
- "step": 29000
417
- },
418
- {
419
- "epoch": 7.833244822092405,
420
- "grad_norm": 0.35300034284591675,
421
- "learning_rate": 1.988234813513624e-05,
422
- "loss": 0.541,
423
- "step": 29500
424
- },
425
- {
426
- "epoch": 7.966011683483803,
427
- "grad_norm": 0.2620261311531067,
428
- "learning_rate": 1.9371706360553946e-05,
429
- "loss": 0.5371,
430
- "step": 30000
431
- },
432
- {
433
- "epoch": 8.098778544875199,
434
- "grad_norm": 0.3098488450050354,
435
- "learning_rate": 1.886106458597165e-05,
436
- "loss": 0.5337,
437
- "step": 30500
438
- },
439
- {
440
- "epoch": 8.231545406266596,
441
- "grad_norm": 0.2904013991355896,
442
- "learning_rate": 1.8350422811389357e-05,
443
- "loss": 0.5342,
444
- "step": 31000
445
- },
446
- {
447
- "epoch": 8.364312267657992,
448
- "grad_norm": 0.29218047857284546,
449
- "learning_rate": 1.783978103680706e-05,
450
- "loss": 0.5323,
451
- "step": 31500
452
- },
453
- {
454
- "epoch": 8.49707912904939,
455
- "grad_norm": 0.3310258090496063,
456
- "learning_rate": 1.7329139262224765e-05,
457
- "loss": 0.5258,
458
- "step": 32000
459
- },
460
- {
461
- "epoch": 8.629845990440787,
462
- "grad_norm": 0.3069627583026886,
463
- "learning_rate": 1.6818497487642472e-05,
464
- "loss": 0.5299,
465
- "step": 32500
466
- },
467
- {
468
- "epoch": 8.762612851832182,
469
- "grad_norm": 0.24625258147716522,
470
- "learning_rate": 1.630887699660934e-05,
471
- "loss": 0.5285,
472
- "step": 33000
473
- },
474
- {
475
- "epoch": 8.89537971322358,
476
- "grad_norm": 0.26636838912963867,
477
- "learning_rate": 1.5798235222027044e-05,
478
- "loss": 0.5294,
479
- "step": 33500
480
- },
481
- {
482
- "epoch": 9.028146574614976,
483
- "grad_norm": 0.2842467725276947,
484
- "learning_rate": 1.5287593447444748e-05,
485
- "loss": 0.5235,
486
- "step": 34000
487
- },
488
- {
489
- "epoch": 9.160913436006373,
490
- "grad_norm": 0.3261110782623291,
491
- "learning_rate": 1.4776951672862455e-05,
492
- "loss": 0.5256,
493
- "step": 34500
494
- },
495
- {
496
- "epoch": 9.293680297397769,
497
- "grad_norm": 0.2750456929206848,
498
- "learning_rate": 1.4266309898280159e-05,
499
- "loss": 0.5218,
500
- "step": 35000
501
- },
502
- {
503
- "epoch": 9.426447158789166,
504
- "grad_norm": 0.26470229029655457,
505
- "learning_rate": 1.3755668123697864e-05,
506
- "loss": 0.522,
507
- "step": 35500
508
- },
509
- {
510
- "epoch": 9.559214020180562,
511
- "grad_norm": 0.24200379848480225,
512
- "learning_rate": 1.3245026349115568e-05,
513
- "loss": 0.5222,
514
- "step": 36000
515
- },
516
- {
517
- "epoch": 9.69198088157196,
518
- "grad_norm": 0.30407610535621643,
519
- "learning_rate": 1.2734384574533272e-05,
520
- "loss": 0.5208,
521
- "step": 36500
522
- },
523
- {
524
- "epoch": 9.824747742963357,
525
- "grad_norm": 0.26741334795951843,
526
- "learning_rate": 1.2224764083500144e-05,
527
- "loss": 0.5185,
528
- "step": 37000
529
- },
530
- {
531
- "epoch": 9.957514604354753,
532
- "grad_norm": 0.2811224162578583,
533
- "learning_rate": 1.1714122308917848e-05,
534
- "loss": 0.515,
535
- "step": 37500
536
- },
537
- {
538
- "epoch": 10.09028146574615,
539
- "grad_norm": 0.2725277543067932,
540
- "learning_rate": 1.1204501817884718e-05,
541
- "loss": 0.517,
542
- "step": 38000
543
- },
544
- {
545
- "epoch": 10.223048327137546,
546
- "grad_norm": 0.31137147545814514,
547
- "learning_rate": 1.0693860043302423e-05,
548
- "loss": 0.5155,
549
- "step": 38500
550
- },
551
- {
552
- "epoch": 10.355815188528943,
553
- "grad_norm": 0.26093247532844543,
554
- "learning_rate": 1.0183218268720129e-05,
555
- "loss": 0.5148,
556
- "step": 39000
557
- },
558
- {
559
- "epoch": 10.488582049920339,
560
- "grad_norm": 0.2848931550979614,
561
- "learning_rate": 9.672576494137833e-06,
562
- "loss": 0.5134,
563
- "step": 39500
564
- },
565
- {
566
- "epoch": 10.621348911311737,
567
- "grad_norm": 0.24945715069770813,
568
- "learning_rate": 9.161934719555536e-06,
569
- "loss": 0.5136,
570
- "step": 40000
571
- },
572
- {
573
- "epoch": 10.754115772703134,
574
- "grad_norm": 0.28524720668792725,
575
- "learning_rate": 8.651292944973242e-06,
576
- "loss": 0.5167,
577
- "step": 40500
578
- },
579
- {
580
- "epoch": 10.88688263409453,
581
- "grad_norm": 0.29454296827316284,
582
- "learning_rate": 8.140651170390948e-06,
583
- "loss": 0.5151,
584
- "step": 41000
585
- },
586
- {
587
- "epoch": 11.019649495485927,
588
- "grad_norm": 0.30919119715690613,
589
- "learning_rate": 7.632051962906982e-06,
590
- "loss": 0.5121,
591
- "step": 41500
592
- },
593
- {
594
- "epoch": 11.152416356877323,
595
- "grad_norm": 0.36948204040527344,
596
- "learning_rate": 7.121410188324687e-06,
597
- "loss": 0.5146,
598
- "step": 42000
599
- },
600
- {
601
- "epoch": 11.28518321826872,
602
- "grad_norm": 0.2883196771144867,
603
- "learning_rate": 6.610768413742392e-06,
604
- "loss": 0.5118,
605
- "step": 42500
606
- },
607
- {
608
- "epoch": 11.417950079660116,
609
- "grad_norm": 0.2851753532886505,
610
- "learning_rate": 6.100126639160097e-06,
611
- "loss": 0.5092,
612
- "step": 43000
613
- },
614
- {
615
- "epoch": 11.550716941051514,
616
- "grad_norm": 0.27395716309547424,
617
- "learning_rate": 5.5894848645778016e-06,
618
- "loss": 0.5044,
619
- "step": 43500
620
- },
621
- {
622
- "epoch": 11.683483802442911,
623
- "grad_norm": 0.2726575434207916,
624
- "learning_rate": 5.078843089995506e-06,
625
- "loss": 0.5106,
626
- "step": 44000
627
- },
628
- {
629
- "epoch": 11.816250663834307,
630
- "grad_norm": 0.29727038741111755,
631
- "learning_rate": 4.568201315413211e-06,
632
- "loss": 0.5095,
633
- "step": 44500
634
- },
635
- {
636
- "epoch": 11.949017525225704,
637
- "grad_norm": 0.2694978713989258,
638
- "learning_rate": 4.0575595408309166e-06,
639
- "loss": 0.5118,
640
- "step": 45000
641
- },
642
- {
643
- "epoch": 12.0817843866171,
644
- "grad_norm": 0.2318025678396225,
645
- "learning_rate": 3.5469177662486213e-06,
646
- "loss": 0.5126,
647
- "step": 45500
648
- },
649
- {
650
- "epoch": 12.214551248008497,
651
- "grad_norm": 0.27759501338005066,
652
- "learning_rate": 3.0362759916663264e-06,
653
- "loss": 0.5081,
654
- "step": 46000
655
- },
656
- {
657
- "epoch": 12.347318109399893,
658
- "grad_norm": 0.2869941294193268,
659
- "learning_rate": 2.525634217084031e-06,
660
- "loss": 0.5046,
661
- "step": 46500
662
- },
663
- {
664
- "epoch": 12.48008497079129,
665
- "grad_norm": 0.32994431257247925,
666
- "learning_rate": 2.0149924425017362e-06,
667
- "loss": 0.5104,
668
- "step": 47000
669
- },
670
- {
671
- "epoch": 12.612851832182688,
672
- "grad_norm": 0.28273916244506836,
673
- "learning_rate": 1.5053719514686058e-06,
674
- "loss": 0.5036,
675
- "step": 47500
676
- },
677
- {
678
- "epoch": 12.745618693574084,
679
- "grad_norm": 0.2604888379573822,
680
- "learning_rate": 9.947301768863107e-07,
681
- "loss": 0.5086,
682
- "step": 48000
683
- },
684
- {
685
- "epoch": 12.878385554965481,
686
- "grad_norm": 0.287817120552063,
687
- "learning_rate": 4.851096858531803e-07,
688
- "loss": 0.5126,
689
- "step": 48500
690
- }
691
- ],
692
- "logging_steps": 500,
693
- "max_steps": 48958,
694
- "num_input_tokens_seen": 0,
695
- "num_train_epochs": 13,
696
- "save_steps": 500,
697
- "stateful_callbacks": {
698
- "TrainerControl": {
699
- "args": {
700
- "should_epoch_stop": false,
701
- "should_evaluate": false,
702
- "should_log": false,
703
- "should_save": true,
704
- "should_training_stop": false
705
- },
706
- "attributes": {}
707
- }
708
- },
709
- "total_flos": 1.0501305718013952e+17,
710
- "train_batch_size": 32,
711
- "trial_name": null,
712
- "trial_params": null
713
- }
checkpoints/checkpoint-48958/trainer_state.json DELETED
@@ -1,713 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 13.0,
6
- "eval_steps": 500,
7
- "global_step": 48958,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.1327668613913967,
14
- "grad_norm": 0.9114758372306824,
15
- "learning_rate": 4.9492422076065206e-05,
16
- "loss": 2.9213,
17
- "step": 500
18
- },
19
- {
20
- "epoch": 0.2655337227827934,
21
- "grad_norm": 0.8418619632720947,
22
- "learning_rate": 4.89817803014829e-05,
23
- "loss": 1.7378,
24
- "step": 1000
25
- },
26
- {
27
- "epoch": 0.3983005841741901,
28
- "grad_norm": 0.709621012210846,
29
- "learning_rate": 4.8471138526900614e-05,
30
- "loss": 1.4102,
31
- "step": 1500
32
- },
33
- {
34
- "epoch": 0.5310674455655868,
35
- "grad_norm": 0.4253033995628357,
36
- "learning_rate": 4.796049675231832e-05,
37
- "loss": 1.247,
38
- "step": 2000
39
- },
40
- {
41
- "epoch": 0.6638343069569835,
42
- "grad_norm": 0.4772707521915436,
43
- "learning_rate": 4.744985497773602e-05,
44
- "loss": 1.1408,
45
- "step": 2500
46
- },
47
- {
48
- "epoch": 0.7966011683483802,
49
- "grad_norm": 0.43140164017677307,
50
- "learning_rate": 4.6939213203153725e-05,
51
- "loss": 1.0649,
52
- "step": 3000
53
- },
54
- {
55
- "epoch": 0.929368029739777,
56
- "grad_norm": 0.39506375789642334,
57
- "learning_rate": 4.642857142857143e-05,
58
- "loss": 1.0004,
59
- "step": 3500
60
- },
61
- {
62
- "epoch": 1.0621348911311737,
63
- "grad_norm": 0.3946131765842438,
64
- "learning_rate": 4.591792965398913e-05,
65
- "loss": 0.9611,
66
- "step": 4000
67
- },
68
- {
69
- "epoch": 1.1949017525225702,
70
- "grad_norm": 0.3581591546535492,
71
- "learning_rate": 4.540728787940684e-05,
72
- "loss": 0.917,
73
- "step": 4500
74
- },
75
- {
76
- "epoch": 1.327668613913967,
77
- "grad_norm": 0.42795246839523315,
78
- "learning_rate": 4.489664610482455e-05,
79
- "loss": 0.8907,
80
- "step": 5000
81
- },
82
- {
83
- "epoch": 1.4604354753053639,
84
- "grad_norm": 0.4635187089443207,
85
- "learning_rate": 4.4386004330242245e-05,
86
- "loss": 0.8598,
87
- "step": 5500
88
- },
89
- {
90
- "epoch": 1.5932023366967605,
91
- "grad_norm": 0.3534747064113617,
92
- "learning_rate": 4.3875362555659955e-05,
93
- "loss": 0.8295,
94
- "step": 6000
95
- },
96
- {
97
- "epoch": 1.725969198088157,
98
- "grad_norm": 0.39130711555480957,
99
- "learning_rate": 4.336472078107766e-05,
100
- "loss": 0.808,
101
- "step": 6500
102
- },
103
- {
104
- "epoch": 1.858736059479554,
105
- "grad_norm": 0.440703809261322,
106
- "learning_rate": 4.285407900649537e-05,
107
- "loss": 0.7927,
108
- "step": 7000
109
- },
110
- {
111
- "epoch": 1.9915029208709507,
112
- "grad_norm": 0.3961372375488281,
113
- "learning_rate": 4.234343723191307e-05,
114
- "loss": 0.7828,
115
- "step": 7500
116
- },
117
- {
118
- "epoch": 2.1242697822623473,
119
- "grad_norm": 0.3364185690879822,
120
- "learning_rate": 4.1833816740879935e-05,
121
- "loss": 0.7628,
122
- "step": 8000
123
- },
124
- {
125
- "epoch": 2.257036643653744,
126
- "grad_norm": 0.34151241183280945,
127
- "learning_rate": 4.1323174966297646e-05,
128
- "loss": 0.7424,
129
- "step": 8500
130
- },
131
- {
132
- "epoch": 2.3898035050451405,
133
- "grad_norm": 0.3292118310928345,
134
- "learning_rate": 4.081253319171535e-05,
135
- "loss": 0.7341,
136
- "step": 9000
137
- },
138
- {
139
- "epoch": 2.5225703664365375,
140
- "grad_norm": 0.33259105682373047,
141
- "learning_rate": 4.0301891417133054e-05,
142
- "loss": 0.7212,
143
- "step": 9500
144
- },
145
- {
146
- "epoch": 2.655337227827934,
147
- "grad_norm": 0.35891205072402954,
148
- "learning_rate": 3.979124964255076e-05,
149
- "loss": 0.712,
150
- "step": 10000
151
- },
152
- {
153
- "epoch": 2.7881040892193307,
154
- "grad_norm": 0.3436354398727417,
155
- "learning_rate": 3.928060786796847e-05,
156
- "loss": 0.6975,
157
- "step": 10500
158
- },
159
- {
160
- "epoch": 2.9208709506107278,
161
- "grad_norm": 0.3285069465637207,
162
- "learning_rate": 3.8769966093386165e-05,
163
- "loss": 0.6881,
164
- "step": 11000
165
- },
166
- {
167
- "epoch": 3.0536378120021244,
168
- "grad_norm": 0.44189152121543884,
169
- "learning_rate": 3.826034560235304e-05,
170
- "loss": 0.6749,
171
- "step": 11500
172
- },
173
- {
174
- "epoch": 3.186404673393521,
175
- "grad_norm": 0.34346967935562134,
176
- "learning_rate": 3.7749703827770744e-05,
177
- "loss": 0.6704,
178
- "step": 12000
179
- },
180
- {
181
- "epoch": 3.3191715347849176,
182
- "grad_norm": 0.29128846526145935,
183
- "learning_rate": 3.723906205318845e-05,
184
- "loss": 0.6652,
185
- "step": 12500
186
- },
187
- {
188
- "epoch": 3.451938396176314,
189
- "grad_norm": 0.3013540208339691,
190
- "learning_rate": 3.672842027860615e-05,
191
- "loss": 0.6588,
192
- "step": 13000
193
- },
194
- {
195
- "epoch": 3.584705257567711,
196
- "grad_norm": 0.32138076424598694,
197
- "learning_rate": 3.6217778504023856e-05,
198
- "loss": 0.6453,
199
- "step": 13500
200
- },
201
- {
202
- "epoch": 3.717472118959108,
203
- "grad_norm": 0.3408374786376953,
204
- "learning_rate": 3.5707136729441566e-05,
205
- "loss": 0.6388,
206
- "step": 14000
207
- },
208
- {
209
- "epoch": 3.8502389803505044,
210
- "grad_norm": 0.9397606253623962,
211
- "learning_rate": 3.519649495485927e-05,
212
- "loss": 0.6349,
213
- "step": 14500
214
- },
215
- {
216
- "epoch": 3.9830058417419014,
217
- "grad_norm": 0.3192440867424011,
218
- "learning_rate": 3.4685853180276974e-05,
219
- "loss": 0.6291,
220
- "step": 15000
221
- },
222
- {
223
- "epoch": 4.115772703133298,
224
- "grad_norm": 0.3549179136753082,
225
- "learning_rate": 3.417521140569468e-05,
226
- "loss": 0.6278,
227
- "step": 15500
228
- },
229
- {
230
- "epoch": 4.248539564524695,
231
- "grad_norm": 0.3110153079032898,
232
- "learning_rate": 3.366456963111239e-05,
233
- "loss": 0.618,
234
- "step": 16000
235
- },
236
- {
237
- "epoch": 4.381306425916091,
238
- "grad_norm": 0.2719564735889435,
239
- "learning_rate": 3.3153927856530086e-05,
240
- "loss": 0.6169,
241
- "step": 16500
242
- },
243
- {
244
- "epoch": 4.514073287307488,
245
- "grad_norm": 0.2858710289001465,
246
- "learning_rate": 3.2643286081947796e-05,
247
- "loss": 0.61,
248
- "step": 17000
249
- },
250
- {
251
- "epoch": 4.646840148698884,
252
- "grad_norm": 0.31373563408851624,
253
- "learning_rate": 3.21326443073655e-05,
254
- "loss": 0.6011,
255
- "step": 17500
256
- },
257
- {
258
- "epoch": 4.779607010090281,
259
- "grad_norm": 0.29438045620918274,
260
- "learning_rate": 3.1622002532783204e-05,
261
- "loss": 0.5938,
262
- "step": 18000
263
- },
264
- {
265
- "epoch": 4.9123738714816785,
266
- "grad_norm": 0.3415851593017578,
267
- "learning_rate": 3.111238204175007e-05,
268
- "loss": 0.5992,
269
- "step": 18500
270
- },
271
- {
272
- "epoch": 5.045140732873075,
273
- "grad_norm": 0.35383546352386475,
274
- "learning_rate": 3.060276155071694e-05,
275
- "loss": 0.5871,
276
- "step": 19000
277
- },
278
- {
279
- "epoch": 5.177907594264472,
280
- "grad_norm": 0.3242381811141968,
281
- "learning_rate": 3.009314105968381e-05,
282
- "loss": 0.5867,
283
- "step": 19500
284
- },
285
- {
286
- "epoch": 5.310674455655868,
287
- "grad_norm": 0.28274649381637573,
288
- "learning_rate": 2.9582499285101516e-05,
289
- "loss": 0.584,
290
- "step": 20000
291
- },
292
- {
293
- "epoch": 5.443441317047265,
294
- "grad_norm": 0.3075231611728668,
295
- "learning_rate": 2.9071857510519223e-05,
296
- "loss": 0.584,
297
- "step": 20500
298
- },
299
- {
300
- "epoch": 5.5762081784386615,
301
- "grad_norm": 0.29568806290626526,
302
- "learning_rate": 2.8561215735936924e-05,
303
- "loss": 0.5743,
304
- "step": 21000
305
- },
306
- {
307
- "epoch": 5.708975039830058,
308
- "grad_norm": 0.32808518409729004,
309
- "learning_rate": 2.805057396135463e-05,
310
- "loss": 0.5757,
311
- "step": 21500
312
- },
313
- {
314
- "epoch": 5.8417419012214555,
315
- "grad_norm": 0.256596177816391,
316
- "learning_rate": 2.7539932186772338e-05,
317
- "loss": 0.5735,
318
- "step": 22000
319
- },
320
- {
321
- "epoch": 5.974508762612852,
322
- "grad_norm": 0.313557505607605,
323
- "learning_rate": 2.702929041219004e-05,
324
- "loss": 0.5679,
325
- "step": 22500
326
- },
327
- {
328
- "epoch": 6.107275624004249,
329
- "grad_norm": 0.274058997631073,
330
- "learning_rate": 2.6518648637607746e-05,
331
- "loss": 0.562,
332
- "step": 23000
333
- },
334
- {
335
- "epoch": 6.240042485395645,
336
- "grad_norm": 0.2777511477470398,
337
- "learning_rate": 2.6008006863025453e-05,
338
- "loss": 0.5619,
339
- "step": 23500
340
- },
341
- {
342
- "epoch": 6.372809346787042,
343
- "grad_norm": 0.3301125466823578,
344
- "learning_rate": 2.549736508844316e-05,
345
- "loss": 0.5598,
346
- "step": 24000
347
- },
348
- {
349
- "epoch": 6.5055762081784385,
350
- "grad_norm": 0.2844313383102417,
351
- "learning_rate": 2.498672331386086e-05,
352
- "loss": 0.5589,
353
- "step": 24500
354
- },
355
- {
356
- "epoch": 6.638343069569835,
357
- "grad_norm": 0.268718421459198,
358
- "learning_rate": 2.4476081539278568e-05,
359
- "loss": 0.5566,
360
- "step": 25000
361
- },
362
- {
363
- "epoch": 6.771109930961232,
364
- "grad_norm": 0.3230023980140686,
365
- "learning_rate": 2.3965439764696272e-05,
366
- "loss": 0.5582,
367
- "step": 25500
368
- },
369
- {
370
- "epoch": 6.903876792352628,
371
- "grad_norm": 0.27747681736946106,
372
- "learning_rate": 2.3454797990113976e-05,
373
- "loss": 0.5527,
374
- "step": 26000
375
- },
376
- {
377
- "epoch": 7.036643653744026,
378
- "grad_norm": 0.29863470792770386,
379
- "learning_rate": 2.2945177499080848e-05,
380
- "loss": 0.5491,
381
- "step": 26500
382
- },
383
- {
384
- "epoch": 7.169410515135422,
385
- "grad_norm": 0.30289873480796814,
386
- "learning_rate": 2.243453572449855e-05,
387
- "loss": 0.5468,
388
- "step": 27000
389
- },
390
- {
391
- "epoch": 7.302177376526819,
392
- "grad_norm": 0.2766277492046356,
393
- "learning_rate": 2.192491523346542e-05,
394
- "loss": 0.5444,
395
- "step": 27500
396
- },
397
- {
398
- "epoch": 7.434944237918216,
399
- "grad_norm": 0.3069545030593872,
400
- "learning_rate": 2.1414273458883124e-05,
401
- "loss": 0.5403,
402
- "step": 28000
403
- },
404
- {
405
- "epoch": 7.567711099309612,
406
- "grad_norm": 0.258329302072525,
407
- "learning_rate": 2.090363168430083e-05,
408
- "loss": 0.5453,
409
- "step": 28500
410
- },
411
- {
412
- "epoch": 7.700477960701009,
413
- "grad_norm": 0.2901703119277954,
414
- "learning_rate": 2.0392989909718535e-05,
415
- "loss": 0.5357,
416
- "step": 29000
417
- },
418
- {
419
- "epoch": 7.833244822092405,
420
- "grad_norm": 0.35300034284591675,
421
- "learning_rate": 1.988234813513624e-05,
422
- "loss": 0.541,
423
- "step": 29500
424
- },
425
- {
426
- "epoch": 7.966011683483803,
427
- "grad_norm": 0.2620261311531067,
428
- "learning_rate": 1.9371706360553946e-05,
429
- "loss": 0.5371,
430
- "step": 30000
431
- },
432
- {
433
- "epoch": 8.098778544875199,
434
- "grad_norm": 0.3098488450050354,
435
- "learning_rate": 1.886106458597165e-05,
436
- "loss": 0.5337,
437
- "step": 30500
438
- },
439
- {
440
- "epoch": 8.231545406266596,
441
- "grad_norm": 0.2904013991355896,
442
- "learning_rate": 1.8350422811389357e-05,
443
- "loss": 0.5342,
444
- "step": 31000
445
- },
446
- {
447
- "epoch": 8.364312267657992,
448
- "grad_norm": 0.29218047857284546,
449
- "learning_rate": 1.783978103680706e-05,
450
- "loss": 0.5323,
451
- "step": 31500
452
- },
453
- {
454
- "epoch": 8.49707912904939,
455
- "grad_norm": 0.3310258090496063,
456
- "learning_rate": 1.7329139262224765e-05,
457
- "loss": 0.5258,
458
- "step": 32000
459
- },
460
- {
461
- "epoch": 8.629845990440787,
462
- "grad_norm": 0.3069627583026886,
463
- "learning_rate": 1.6818497487642472e-05,
464
- "loss": 0.5299,
465
- "step": 32500
466
- },
467
- {
468
- "epoch": 8.762612851832182,
469
- "grad_norm": 0.24625258147716522,
470
- "learning_rate": 1.630887699660934e-05,
471
- "loss": 0.5285,
472
- "step": 33000
473
- },
474
- {
475
- "epoch": 8.89537971322358,
476
- "grad_norm": 0.26636838912963867,
477
- "learning_rate": 1.5798235222027044e-05,
478
- "loss": 0.5294,
479
- "step": 33500
480
- },
481
- {
482
- "epoch": 9.028146574614976,
483
- "grad_norm": 0.2842467725276947,
484
- "learning_rate": 1.5287593447444748e-05,
485
- "loss": 0.5235,
486
- "step": 34000
487
- },
488
- {
489
- "epoch": 9.160913436006373,
490
- "grad_norm": 0.3261110782623291,
491
- "learning_rate": 1.4776951672862455e-05,
492
- "loss": 0.5256,
493
- "step": 34500
494
- },
495
- {
496
- "epoch": 9.293680297397769,
497
- "grad_norm": 0.2750456929206848,
498
- "learning_rate": 1.4266309898280159e-05,
499
- "loss": 0.5218,
500
- "step": 35000
501
- },
502
- {
503
- "epoch": 9.426447158789166,
504
- "grad_norm": 0.26470229029655457,
505
- "learning_rate": 1.3755668123697864e-05,
506
- "loss": 0.522,
507
- "step": 35500
508
- },
509
- {
510
- "epoch": 9.559214020180562,
511
- "grad_norm": 0.24200379848480225,
512
- "learning_rate": 1.3245026349115568e-05,
513
- "loss": 0.5222,
514
- "step": 36000
515
- },
516
- {
517
- "epoch": 9.69198088157196,
518
- "grad_norm": 0.30407610535621643,
519
- "learning_rate": 1.2734384574533272e-05,
520
- "loss": 0.5208,
521
- "step": 36500
522
- },
523
- {
524
- "epoch": 9.824747742963357,
525
- "grad_norm": 0.26741334795951843,
526
- "learning_rate": 1.2224764083500144e-05,
527
- "loss": 0.5185,
528
- "step": 37000
529
- },
530
- {
531
- "epoch": 9.957514604354753,
532
- "grad_norm": 0.2811224162578583,
533
- "learning_rate": 1.1714122308917848e-05,
534
- "loss": 0.515,
535
- "step": 37500
536
- },
537
- {
538
- "epoch": 10.09028146574615,
539
- "grad_norm": 0.2725277543067932,
540
- "learning_rate": 1.1204501817884718e-05,
541
- "loss": 0.517,
542
- "step": 38000
543
- },
544
- {
545
- "epoch": 10.223048327137546,
546
- "grad_norm": 0.31137147545814514,
547
- "learning_rate": 1.0693860043302423e-05,
548
- "loss": 0.5155,
549
- "step": 38500
550
- },
551
- {
552
- "epoch": 10.355815188528943,
553
- "grad_norm": 0.26093247532844543,
554
- "learning_rate": 1.0183218268720129e-05,
555
- "loss": 0.5148,
556
- "step": 39000
557
- },
558
- {
559
- "epoch": 10.488582049920339,
560
- "grad_norm": 0.2848931550979614,
561
- "learning_rate": 9.672576494137833e-06,
562
- "loss": 0.5134,
563
- "step": 39500
564
- },
565
- {
566
- "epoch": 10.621348911311737,
567
- "grad_norm": 0.24945715069770813,
568
- "learning_rate": 9.161934719555536e-06,
569
- "loss": 0.5136,
570
- "step": 40000
571
- },
572
- {
573
- "epoch": 10.754115772703134,
574
- "grad_norm": 0.28524720668792725,
575
- "learning_rate": 8.651292944973242e-06,
576
- "loss": 0.5167,
577
- "step": 40500
578
- },
579
- {
580
- "epoch": 10.88688263409453,
581
- "grad_norm": 0.29454296827316284,
582
- "learning_rate": 8.140651170390948e-06,
583
- "loss": 0.5151,
584
- "step": 41000
585
- },
586
- {
587
- "epoch": 11.019649495485927,
588
- "grad_norm": 0.30919119715690613,
589
- "learning_rate": 7.632051962906982e-06,
590
- "loss": 0.5121,
591
- "step": 41500
592
- },
593
- {
594
- "epoch": 11.152416356877323,
595
- "grad_norm": 0.36948204040527344,
596
- "learning_rate": 7.121410188324687e-06,
597
- "loss": 0.5146,
598
- "step": 42000
599
- },
600
- {
601
- "epoch": 11.28518321826872,
602
- "grad_norm": 0.2883196771144867,
603
- "learning_rate": 6.610768413742392e-06,
604
- "loss": 0.5118,
605
- "step": 42500
606
- },
607
- {
608
- "epoch": 11.417950079660116,
609
- "grad_norm": 0.2851753532886505,
610
- "learning_rate": 6.100126639160097e-06,
611
- "loss": 0.5092,
612
- "step": 43000
613
- },
614
- {
615
- "epoch": 11.550716941051514,
616
- "grad_norm": 0.27395716309547424,
617
- "learning_rate": 5.5894848645778016e-06,
618
- "loss": 0.5044,
619
- "step": 43500
620
- },
621
- {
622
- "epoch": 11.683483802442911,
623
- "grad_norm": 0.2726575434207916,
624
- "learning_rate": 5.078843089995506e-06,
625
- "loss": 0.5106,
626
- "step": 44000
627
- },
628
- {
629
- "epoch": 11.816250663834307,
630
- "grad_norm": 0.29727038741111755,
631
- "learning_rate": 4.568201315413211e-06,
632
- "loss": 0.5095,
633
- "step": 44500
634
- },
635
- {
636
- "epoch": 11.949017525225704,
637
- "grad_norm": 0.2694978713989258,
638
- "learning_rate": 4.0575595408309166e-06,
639
- "loss": 0.5118,
640
- "step": 45000
641
- },
642
- {
643
- "epoch": 12.0817843866171,
644
- "grad_norm": 0.2318025678396225,
645
- "learning_rate": 3.5469177662486213e-06,
646
- "loss": 0.5126,
647
- "step": 45500
648
- },
649
- {
650
- "epoch": 12.214551248008497,
651
- "grad_norm": 0.27759501338005066,
652
- "learning_rate": 3.0362759916663264e-06,
653
- "loss": 0.5081,
654
- "step": 46000
655
- },
656
- {
657
- "epoch": 12.347318109399893,
658
- "grad_norm": 0.2869941294193268,
659
- "learning_rate": 2.525634217084031e-06,
660
- "loss": 0.5046,
661
- "step": 46500
662
- },
663
- {
664
- "epoch": 12.48008497079129,
665
- "grad_norm": 0.32994431257247925,
666
- "learning_rate": 2.0149924425017362e-06,
667
- "loss": 0.5104,
668
- "step": 47000
669
- },
670
- {
671
- "epoch": 12.612851832182688,
672
- "grad_norm": 0.28273916244506836,
673
- "learning_rate": 1.5053719514686058e-06,
674
- "loss": 0.5036,
675
- "step": 47500
676
- },
677
- {
678
- "epoch": 12.745618693574084,
679
- "grad_norm": 0.2604888379573822,
680
- "learning_rate": 9.947301768863107e-07,
681
- "loss": 0.5086,
682
- "step": 48000
683
- },
684
- {
685
- "epoch": 12.878385554965481,
686
- "grad_norm": 0.287817120552063,
687
- "learning_rate": 4.851096858531803e-07,
688
- "loss": 0.5126,
689
- "step": 48500
690
- }
691
- ],
692
- "logging_steps": 500,
693
- "max_steps": 48958,
694
- "num_input_tokens_seen": 0,
695
- "num_train_epochs": 13,
696
- "save_steps": 500,
697
- "stateful_callbacks": {
698
- "TrainerControl": {
699
- "args": {
700
- "should_epoch_stop": false,
701
- "should_evaluate": false,
702
- "should_log": false,
703
- "should_save": true,
704
- "should_training_stop": true
705
- },
706
- "attributes": {}
707
- }
708
- },
709
- "total_flos": 1.060038268378153e+17,
710
- "train_batch_size": 32,
711
- "trial_name": null,
712
- "trial_params": null
713
- }
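
The trainer_state.json files removed above all share one layout: a log_history list with one entry per logging interval (every 500 steps here), each carrying epoch, step, loss, grad_norm and learning_rate, plus run-level fields such as max_steps, num_train_epochs and total_flos. A minimal sketch for inspecting any retained checkpoint's trainer_state.json follows; the path is a placeholder and matplotlib is an assumed extra dependency, not something this repository declares.

import json

import matplotlib.pyplot as plt  # assumed to be installed; not declared by this repo

# Placeholder path: point this at any checkpoint's trainer_state.json.
STATE_PATH = "checkpoints/checkpoint-XXXXX/trainer_state.json"

with open(STATE_PATH, encoding="utf-8") as f:
    state = json.load(f)

# Entries written at logging steps carry "loss"; skip anything else (e.g. eval records).
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title(f"{state['global_step']} steps, {state['epoch']:.2f} epochs")
plt.show()

Applied to the logs shown above, this traces the training loss falling from about 2.92 at step 500 to roughly 0.51 near the end of the run.
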
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8ea98de1cde992e950903fb96553ceb84e46b447461bc9f940922b80e9bc3c6
3
  size 242041896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c80a0cfc2db55db1dfa355590567e32e3a667c714291cca4e29780a242d11605
3
  size 242041896
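
The *.safetensors and *.jsonl hunks in this commit touch only Git LFS pointer files: oid is the SHA-256 of the real object's content and size is its byte count, so a changed oid means the underlying binary was swapped even though the pointer stays three lines long. The sketch below, which assumes the actual file (not just the pointer) has been pulled locally, recomputes that digest for comparison with the oid recorded above.

import hashlib

# Recompute the SHA-256 that Git LFS records as "oid" in the pointer file.
digest = hashlib.sha256()
with open("model.safetensors", "rb") as f:            # path as named in this diff
    for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
        digest.update(chunk)

print(digest.hexdigest())  # expected to start with c80a0cfc2db5... for the new object
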
src/data/clean_corpus.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f46dd6881eb75c95e2954fdccb43aa4c758f9ab2b7e6c049fa3a42a6768552b
3
- size 631303314
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d60854be8ec118d766e050b63140bbd56c124f9f0d357f8448c30cff10b0d92
3
+ size 628321869
src/data/generate_cyr_lat_pairs.py CHANGED
@@ -4,28 +4,59 @@ import json
4
 
5
  # Kazakh Cyrillic character to the Kazakh Latin character mapping from 2021 onwards
6
  cyrillic_to_latin = {
7
- "А": "A", "а": "a", "Ә": "Ä", "ә": "ä",
8
- "Б": "B", "б": "b", "В": "V", "в": "v",
9
- "Г": "G", "г": "g", "Ғ": "Ğ", "ғ": "ğ",
10
- "Д": "D", "д": "d", "Е": "E", "е": "e",
11
  "Ё": "Io", "ё": "io",
12
- "Ж": "Zh", "ж": "zh", "З": "Z", "з": "z",
13
- "И": "I", "и": "i", "Й": "Y", "й": "y",
14
- "К": "K", "к": "k", "Қ": "Q", "қ": "q",
15
- "Л": "L", "л": "l", "М": "M", "м": "m",
16
- "Н": "N", "н": "n", "Ң": "Ñ", "ң": "ñ",
17
- "О": "O", "о": "o", "Ө": "Ö", "ө": "ö",
18
- "П": "P", "п": "p", "Р": "R", "р": "r",
19
- "С": "S", "с": "s", "Т": "T", "т": "t",
20
- "У": "U", "у": "u", "Ұ": "Ū", "ұ": "ū",
21
- "Ү": "Ü", "ү": "ü", "Ф": "F", "ф": "f",
22
- "Х": "H", "х": "h", "Һ": "H", "һ": "h",
23
- "Ц": "Ts", "ц": "ts", "Ч": "Tş", "ч": "tş",
24
- "Ш": "Ş", "ш": "ş", "Щ": "Ştş", "щ": "ştş",
25
- "Ъ": "", "ъ": "", "Ы": "Y", "ы": "y",
26
- "І": "I", "і": "i", "Ь": "", "ь": "",
27
- "Э": "E", "э": "e", "Ю": "Iu", "ю": "iu",
28
- "Я": "Ia", "я": "ia"
29
  }
30
 
31
 
 
4
 
5
  # Kazakh Cyrillic character to the Kazakh Latin character mapping from 2021 onwards
6
  cyrillic_to_latin = {
7
+ "А": "A", "а": "a",
8
+ "Ә": "Ä", "ә": "ä",
9
+ "Б": "B", "б": "b",
10
+ "Д": "D", "д": "d",
11
+ "Е": "E", "е": "e",
12
+ "Ф": "F", "ф": "f",
13
+ "Г": "G", "г": "g",
14
+ "Ғ": "Ğ", "ғ": "ğ",
15
+ "Х": "H", "х": "h", # also Һ, see below
16
+ "Һ": "H", "һ": "h",
17
+
18
+ "И": "I", "и": "i", # used for [и], [й]
19
+ "І": "I", "і": "i", # distinct from И in sound, both map to 'I/i'
20
+ "Ж": "J", "ж": "j",
21
+
22
+ "К": "K", "к": "k",
23
+ "Қ": "Q", "қ": "q",
24
+ "Л": "L", "л": "l",
25
+ "М": "M", "м": "m",
26
+ "Н": "N", "н": "n",
27
+ "Ң": "Ñ", "ң": "ñ",
28
+
29
+ "О": "O", "о": "o",
30
+ "Ө": "Ö", "ө": "ö",
31
+
32
+ "П": "P", "п": "p",
33
+ "Р": "R", "р": "r",
34
+ "С": "S", "с": "s",
35
+ "Ш": "Ş", "ш": "ş",
36
+ "Т": "T", "т": "t",
37
+
38
+ "У": "U", "у": "u", # basic 'u' sound, distinct from Ұ
39
+ "Ұ": "Ū", "ұ": "ū", # back rounded, used frequently
40
+ "Ү": "Ü", "ү": "ü", # front rounded
41
+
42
+ "В": "V", "в": "v",
43
+ "Ы": "Y", "ы": "y",
44
+ "Й": "I", "й": "i", # same treatment as И
45
+ "Ц": "Ts", "ц": "ts", # for Russian borrowings
46
+ "Ч": "Ch", "ч": "ch",
47
+ "Щ": "Ş", "щ": "ş", # typically simplified to 'ş'
48
+
49
+ "Э": "E", "э": "e",
50
+ "Ю": "Iu", "ю": "iu", # borrowed words only
51
+ "Я": "Ia", "я": "ia",
52
+
53
+ "Ъ": "", "ъ": "",
54
+ "Ь": "", "ь": "",
55
+
56
+ "З": "Z", "з": "z",
57
+
58
+ # Additional (not in table but used in borrowings)
59
  "Ё": "Io", "ё": "io",
60
  }
61
 
62
 
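
Because every key in the updated cyrillic_to_latin dict is a single Cyrillic character and every value is a plain replacement string (possibly empty, as for Ъ and Ь), the whole conversion can be driven by str.translate. The sketch below is illustrative only: to_latin is a hypothetical helper name rather than a function taken from generate_cyr_lat_pairs.py, and the dict is abbreviated to the letters needed for the example.

# Illustrative use of a mapping like the one above (abbreviated here).
cyrillic_to_latin = {
    "Қ": "Q", "қ": "q", "А": "A", "а": "a", "З": "Z", "з": "z",
    "С": "S", "с": "s", "Т": "T", "т": "t", "Н": "N", "н": "n",
    # ... remaining pairs exactly as in the hunk above
}

# str.maketrans accepts 1-character keys mapped to arbitrary strings
# (including ""), so one translate() call converts a whole string.
translation_table = str.maketrans(cyrillic_to_latin)

def to_latin(text: str) -> str:
    return text.translate(translation_table)

print(to_latin("Қазақстан"))  # -> Qazaqstan
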
src/data/kazakh_latin_corpus.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f46dd6881eb75c95e2954fdccb43aa4c758f9ab2b7e6c049fa3a42a6768552b
3
- size 631303314
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d60854be8ec118d766e050b63140bbd56c124f9f0d357f8448c30cff10b0d92
3
+ size 628321869
src/train_t5.py CHANGED
@@ -49,7 +49,7 @@ data_collator = DataCollatorForSeq2Seq(tokenizer = tokeniser, model = model)
49
  training_args = TrainingArguments(
50
  output_dir = output_dir,
51
  overwrite_output_dir = True,
52
- num_train_epochs = 13,
53
  per_device_train_batch_size = 32,
54
  gradient_accumulation_steps = 2,
55
  save_strategy = "steps",
 
49
  training_args = TrainingArguments(
50
  output_dir = output_dir,
51
  overwrite_output_dir = True,
52
+ num_train_epochs = 15,
53
  per_device_train_batch_size = 32,
54
  gradient_accumulation_steps = 2,
55
  save_strategy = "steps",
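
For context on the hunk above: with per_device_train_batch_size = 32 and gradient_accumulation_steps = 2 unchanged, each optimizer update still sees an effective batch of 64 examples per device, and the only edit in this hunk is the epoch count (13 to 15). A minimal sketch of how arguments like these are usually handed to the Hugging Face Trainer follows; model, train_dataset and data_collator are assumed to be the objects built earlier in train_t5.py and are not shown here.

from transformers import Trainer

# Sketch only: model, train_dataset and data_collator are assumed to exist,
# and training_args is the TrainingArguments object defined above.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    data_collator = data_collator,
)
trainer.train()                                # runs for num_train_epochs epochs
trainer.save_model(training_args.output_dir)   # writes the final model to output_dir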