mikhail-panzo commited on
Commit
f8d1b7d
1 Parent(s): 79ff126

Training in progress, step 2500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:669a57ca4489f0bb6c59a5df586177029446e718eaf5f36976301e8b45ee8cff
3
  size 577789320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65cc40d94c31ec94c9b98763487736e129c44c2fd99d58b70a1ed20a361a1eaa
3
  size 577789320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6542f5072af4b2e256a0fa8b74a638fed9b1baded6c85cb5c87f84a103b9def4
3
  size 1155772233
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adb391bb8e15ea2bb7244598232e46d31016900192d4dc478b9ba1a9edca860c
3
  size 1155772233
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d829676cdf8691bbda647d4511205df943fb386c557381ba360dda2aff5f1227
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:968291a21d5bd54cba05589e1f0110365f5d188066676d4b8853115288d17bb8
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15b560a9dc43b2ed5d8c7b0910cf19c12068f6e2db4cd26fd270e940d4d1787b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a04aa548ba1997bc55fa261cc2851282a66f1a8d19fe3862e3573f33f7d76f5
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.39941468834877014,
3
- "best_model_checkpoint": "mikhail_panzo/ceb_b64_le4_s4000/checkpoint-1000",
4
- "epoch": 79.20792079207921,
5
  "eval_steps": 500,
6
- "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -319,6 +319,84 @@
319
  "eval_samples_per_second": 24.13,
320
  "eval_steps_per_second": 3.083,
321
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  }
323
  ],
324
  "logging_steps": 50,
@@ -338,7 +416,7 @@
338
  "attributes": {}
339
  }
340
  },
341
- "total_flos": 2.161155031107257e+16,
342
  "train_batch_size": 16,
343
  "trial_name": null,
344
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.3953019976615906,
3
+ "best_model_checkpoint": "mikhail_panzo/ceb_b64_le4_s4000/checkpoint-2500",
4
+ "epoch": 99.00990099009901,
5
  "eval_steps": 500,
6
+ "global_step": 2500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
319
  "eval_samples_per_second": 24.13,
320
  "eval_steps_per_second": 3.083,
321
  "step": 2000
322
+ },
323
+ {
324
+ "epoch": 81.18811881188118,
325
+ "grad_norm": 2.13647723197937,
326
+ "learning_rate": 9.755000000000001e-05,
327
+ "loss": 0.3994,
328
+ "step": 2050
329
+ },
330
+ {
331
+ "epoch": 83.16831683168317,
332
+ "grad_norm": 1.8043389320373535,
333
+ "learning_rate": 9.505e-05,
334
+ "loss": 0.3982,
335
+ "step": 2100
336
+ },
337
+ {
338
+ "epoch": 85.14851485148515,
339
+ "grad_norm": 1.0940660238265991,
340
+ "learning_rate": 9.255e-05,
341
+ "loss": 0.3926,
342
+ "step": 2150
343
+ },
344
+ {
345
+ "epoch": 87.12871287128714,
346
+ "grad_norm": 2.0112838745117188,
347
+ "learning_rate": 9.005000000000001e-05,
348
+ "loss": 0.3822,
349
+ "step": 2200
350
+ },
351
+ {
352
+ "epoch": 89.10891089108911,
353
+ "grad_norm": 1.5353419780731201,
354
+ "learning_rate": 8.755e-05,
355
+ "loss": 0.3857,
356
+ "step": 2250
357
+ },
358
+ {
359
+ "epoch": 91.08910891089108,
360
+ "grad_norm": 2.0039117336273193,
361
+ "learning_rate": 8.505000000000001e-05,
362
+ "loss": 0.3887,
363
+ "step": 2300
364
+ },
365
+ {
366
+ "epoch": 93.06930693069307,
367
+ "grad_norm": 1.2098206281661987,
368
+ "learning_rate": 8.255e-05,
369
+ "loss": 0.3874,
370
+ "step": 2350
371
+ },
372
+ {
373
+ "epoch": 95.04950495049505,
374
+ "grad_norm": 0.9372404217720032,
375
+ "learning_rate": 8.005000000000001e-05,
376
+ "loss": 0.3816,
377
+ "step": 2400
378
+ },
379
+ {
380
+ "epoch": 97.02970297029702,
381
+ "grad_norm": 0.9278631806373596,
382
+ "learning_rate": 7.755e-05,
383
+ "loss": 0.3797,
384
+ "step": 2450
385
+ },
386
+ {
387
+ "epoch": 99.00990099009901,
388
+ "grad_norm": 1.0212403535842896,
389
+ "learning_rate": 7.505e-05,
390
+ "loss": 0.384,
391
+ "step": 2500
392
+ },
393
+ {
394
+ "epoch": 99.00990099009901,
395
+ "eval_loss": 0.3953019976615906,
396
+ "eval_runtime": 6.9648,
397
+ "eval_samples_per_second": 25.844,
398
+ "eval_steps_per_second": 3.302,
399
+ "step": 2500
400
  }
401
  ],
402
  "logging_steps": 50,
 
416
  "attributes": {}
417
  }
418
  },
419
+ "total_flos": 2.701326498107371e+16,
420
  "train_batch_size": 16,
421
  "trial_name": null,
422
  "trial_params": null