mikhail-panzo committed on
Commit
5f23162
1 Parent(s): ed9b985

Training in progress, step 2500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca029c96d767da45b7323e47c97d5e6f73b5c0d8dc25a679391682e01d0116f3
3
  size 577789320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6594507d2093f7d04ea48dba9b17f445e9541e34c9ab8598b25c91de8e5ab89
3
  size 577789320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc2a528c117c0804a8911914e1d71117feb3539aa52b1dcd1672a20000241a5c
3
  size 1155772233
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe7a50c86b765550303567a7c71dc3287d2a1793b843374f3662606fefc4c7b7
3
  size 1155772233
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1408fe68df18c0e59db68d59ba316117312bd780df72ef23ec45a7e9f2b2bcd9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e8aa2535c3a1ec264ce2c31a7ac416d4861404181dd39ae479182880849bcf2
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f32a07717606d664792d4ebfd434223710fe948a637ff5f34234da98aa96ac43
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8452e8180915c2a4933789804dc213986b2971ca2876fd0285cc8294b6a4c56
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.4116212725639343,
3
- "best_model_checkpoint": "mikhail-panzo/ceb_b64_le5_s8000/checkpoint-1500",
4
- "epoch": 58.8235294117647,
5
  "eval_steps": 500,
6
- "global_step": 1500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -241,6 +241,162 @@
241
  "eval_samples_per_second": 28.665,
242
  "eval_steps_per_second": 3.663,
243
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  }
245
  ],
246
  "logging_steps": 50,
@@ -260,7 +416,7 @@
260
  "attributes": {}
261
  }
262
  },
263
- "total_flos": 1.6328687985209952e+16,
264
  "train_batch_size": 32,
265
  "trial_name": null,
266
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.3997121751308441,
3
+ "best_model_checkpoint": "mikhail-panzo/ceb_b64_le5_s8000/checkpoint-2500",
4
+ "epoch": 98.03921568627452,
5
  "eval_steps": 500,
6
+ "global_step": 2500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
241
  "eval_samples_per_second": 28.665,
242
  "eval_steps_per_second": 3.663,
243
  "step": 1500
244
+ },
245
+ {
246
+ "epoch": 60.78431372549019,
247
+ "grad_norm": 0.9767690896987915,
248
+ "learning_rate": 7.745e-06,
249
+ "loss": 0.4517,
250
+ "step": 1550
251
+ },
252
+ {
253
+ "epoch": 62.745098039215684,
254
+ "grad_norm": 1.4193260669708252,
255
+ "learning_rate": 7.995e-06,
256
+ "loss": 0.4537,
257
+ "step": 1600
258
+ },
259
+ {
260
+ "epoch": 64.70588235294117,
261
+ "grad_norm": 1.3294360637664795,
262
+ "learning_rate": 8.245000000000002e-06,
263
+ "loss": 0.4435,
264
+ "step": 1650
265
+ },
266
+ {
267
+ "epoch": 66.66666666666667,
268
+ "grad_norm": 0.8386899828910828,
269
+ "learning_rate": 8.495e-06,
270
+ "loss": 0.4507,
271
+ "step": 1700
272
+ },
273
+ {
274
+ "epoch": 68.62745098039215,
275
+ "grad_norm": 1.0917119979858398,
276
+ "learning_rate": 8.745000000000002e-06,
277
+ "loss": 0.4409,
278
+ "step": 1750
279
+ },
280
+ {
281
+ "epoch": 70.58823529411765,
282
+ "grad_norm": 1.0725489854812622,
283
+ "learning_rate": 8.995000000000001e-06,
284
+ "loss": 0.4449,
285
+ "step": 1800
286
+ },
287
+ {
288
+ "epoch": 72.54901960784314,
289
+ "grad_norm": 1.3506999015808105,
290
+ "learning_rate": 9.245e-06,
291
+ "loss": 0.4496,
292
+ "step": 1850
293
+ },
294
+ {
295
+ "epoch": 74.50980392156863,
296
+ "grad_norm": 0.9701379537582397,
297
+ "learning_rate": 9.495000000000001e-06,
298
+ "loss": 0.4384,
299
+ "step": 1900
300
+ },
301
+ {
302
+ "epoch": 76.47058823529412,
303
+ "grad_norm": 1.7079219818115234,
304
+ "learning_rate": 9.745e-06,
305
+ "loss": 0.4374,
306
+ "step": 1950
307
+ },
308
+ {
309
+ "epoch": 78.43137254901961,
310
+ "grad_norm": 1.87998628616333,
311
+ "learning_rate": 9.995000000000002e-06,
312
+ "loss": 0.4346,
313
+ "step": 2000
314
+ },
315
+ {
316
+ "epoch": 78.43137254901961,
317
+ "eval_loss": 0.4027920663356781,
318
+ "eval_runtime": 6.3546,
319
+ "eval_samples_per_second": 28.326,
320
+ "eval_steps_per_second": 3.619,
321
+ "step": 2000
322
+ },
323
+ {
324
+ "epoch": 80.3921568627451,
325
+ "grad_norm": 1.1510419845581055,
326
+ "learning_rate": 9.918333333333335e-06,
327
+ "loss": 0.4326,
328
+ "step": 2050
329
+ },
330
+ {
331
+ "epoch": 82.3529411764706,
332
+ "grad_norm": 1.2605654001235962,
333
+ "learning_rate": 9.835000000000002e-06,
334
+ "loss": 0.4355,
335
+ "step": 2100
336
+ },
337
+ {
338
+ "epoch": 84.31372549019608,
339
+ "grad_norm": 0.866606593132019,
340
+ "learning_rate": 9.751666666666667e-06,
341
+ "loss": 0.4286,
342
+ "step": 2150
343
+ },
344
+ {
345
+ "epoch": 86.27450980392157,
346
+ "grad_norm": 2.0733227729797363,
347
+ "learning_rate": 9.668333333333334e-06,
348
+ "loss": 0.4365,
349
+ "step": 2200
350
+ },
351
+ {
352
+ "epoch": 88.23529411764706,
353
+ "grad_norm": 0.9726402759552002,
354
+ "learning_rate": 9.585e-06,
355
+ "loss": 0.4367,
356
+ "step": 2250
357
+ },
358
+ {
359
+ "epoch": 90.19607843137256,
360
+ "grad_norm": 1.0713222026824951,
361
+ "learning_rate": 9.501666666666667e-06,
362
+ "loss": 0.4288,
363
+ "step": 2300
364
+ },
365
+ {
366
+ "epoch": 92.15686274509804,
367
+ "grad_norm": 1.5218483209609985,
368
+ "learning_rate": 9.418333333333334e-06,
369
+ "loss": 0.435,
370
+ "step": 2350
371
+ },
372
+ {
373
+ "epoch": 94.11764705882354,
374
+ "grad_norm": 0.8391968011856079,
375
+ "learning_rate": 9.335000000000001e-06,
376
+ "loss": 0.431,
377
+ "step": 2400
378
+ },
379
+ {
380
+ "epoch": 96.07843137254902,
381
+ "grad_norm": 1.3989890813827515,
382
+ "learning_rate": 9.251666666666668e-06,
383
+ "loss": 0.4251,
384
+ "step": 2450
385
+ },
386
+ {
387
+ "epoch": 98.03921568627452,
388
+ "grad_norm": 0.9168123006820679,
389
+ "learning_rate": 9.168333333333333e-06,
390
+ "loss": 0.4292,
391
+ "step": 2500
392
+ },
393
+ {
394
+ "epoch": 98.03921568627452,
395
+ "eval_loss": 0.3997121751308441,
396
+ "eval_runtime": 6.4036,
397
+ "eval_samples_per_second": 28.109,
398
+ "eval_steps_per_second": 3.592,
399
+ "step": 2500
400
  }
401
  ],
402
  "logging_steps": 50,
 
416
  "attributes": {}
417
  }
418
  },
419
+ "total_flos": 2.7204142566350376e+16,
420
  "train_batch_size": 32,
421
  "trial_name": null,
422
  "trial_params": null