mikhail-panzo committed on
Commit
61f067a
1 Parent(s): e7439cd

Training in progress, step 2500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7308e6252b959616b2fe7c9a1e30cbe6a34ffb2d029f50db793552fbd2bdbfbc
3
  size 577789320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a6f08fab5c9cd3e3824141f661d04fab419e16f0f27aa8be427a55a045a5024
3
  size 577789320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e080c2a47685ca953b3b9d5095ee4227f7d627920903f3549b23b13a6c86e24
3
  size 1155772233
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59d2fcaa714da200edbccb39a9f2728d1eac302590a189b7a1ad33e51ddded74
3
  size 1155772233
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f01983801a45234478c13db780514a852cdeaff2aa79f279442e47dc68cb11d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0a3df38c17b475ea7e47b6c462cb3838a91cdec2a3f47a69fc35d66481cfa21
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15b560a9dc43b2ed5d8c7b0910cf19c12068f6e2db4cd26fd270e940d4d1787b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b382f91d6a62c99969921e0d8014a3b1f89a198a6a81ab888e1194bc144c13d5
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.36490994691848755,
3
- "best_model_checkpoint": "mikhail_panzo/zlm_b128_le4_s8000/checkpoint-2000",
4
- "epoch": 3.350785340314136,
5
  "eval_steps": 500,
6
- "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -319,6 +319,84 @@
319
  "eval_samples_per_second": 31.924,
320
  "eval_steps_per_second": 3.994,
321
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  }
323
  ],
324
  "logging_steps": 50,
@@ -338,7 +416,7 @@
338
  "attributes": {}
339
  }
340
  },
341
- "total_flos": 3.5804068604023104e+16,
342
  "train_batch_size": 16,
343
  "trial_name": null,
344
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.3528364896774292,
3
+ "best_model_checkpoint": "mikhail_panzo/zlm_b128_le4_s8000/checkpoint-2500",
4
+ "epoch": 4.18848167539267,
5
  "eval_steps": 500,
6
+ "global_step": 2500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
319
  "eval_samples_per_second": 31.924,
320
  "eval_steps_per_second": 3.994,
321
  "step": 2000
322
+ },
323
+ {
324
+ "epoch": 3.4345549738219896,
325
+ "grad_norm": 2.121384859085083,
326
+ "learning_rate": 9.918333333333334e-05,
327
+ "loss": 0.4058,
328
+ "step": 2050
329
+ },
330
+ {
331
+ "epoch": 3.518324607329843,
332
+ "grad_norm": 1.645984411239624,
333
+ "learning_rate": 9.835e-05,
334
+ "loss": 0.4021,
335
+ "step": 2100
336
+ },
337
+ {
338
+ "epoch": 3.6020942408376966,
339
+ "grad_norm": 1.246239185333252,
340
+ "learning_rate": 9.751666666666666e-05,
341
+ "loss": 0.3991,
342
+ "step": 2150
343
+ },
344
+ {
345
+ "epoch": 3.6858638743455496,
346
+ "grad_norm": 1.9096795320510864,
347
+ "learning_rate": 9.668333333333334e-05,
348
+ "loss": 0.3961,
349
+ "step": 2200
350
+ },
351
+ {
352
+ "epoch": 3.769633507853403,
353
+ "grad_norm": 1.8867601156234741,
354
+ "learning_rate": 9.585000000000001e-05,
355
+ "loss": 0.3904,
356
+ "step": 2250
357
+ },
358
+ {
359
+ "epoch": 3.8534031413612566,
360
+ "grad_norm": 1.7438101768493652,
361
+ "learning_rate": 9.501666666666668e-05,
362
+ "loss": 0.3895,
363
+ "step": 2300
364
+ },
365
+ {
366
+ "epoch": 3.93717277486911,
367
+ "grad_norm": 1.1799490451812744,
368
+ "learning_rate": 9.418333333333334e-05,
369
+ "loss": 0.4027,
370
+ "step": 2350
371
+ },
372
+ {
373
+ "epoch": 4.020942408376963,
374
+ "grad_norm": 1.1952763795852661,
375
+ "learning_rate": 9.335e-05,
376
+ "loss": 0.3893,
377
+ "step": 2400
378
+ },
379
+ {
380
+ "epoch": 4.104712041884817,
381
+ "grad_norm": 2.008756160736084,
382
+ "learning_rate": 9.251666666666667e-05,
383
+ "loss": 0.3878,
384
+ "step": 2450
385
+ },
386
+ {
387
+ "epoch": 4.18848167539267,
388
+ "grad_norm": 2.2693591117858887,
389
+ "learning_rate": 9.168333333333333e-05,
390
+ "loss": 0.3863,
391
+ "step": 2500
392
+ },
393
+ {
394
+ "epoch": 4.18848167539267,
395
+ "eval_loss": 0.3528364896774292,
396
+ "eval_runtime": 272.7627,
397
+ "eval_samples_per_second": 31.122,
398
+ "eval_steps_per_second": 3.893,
399
+ "step": 2500
400
  }
401
  ],
402
  "logging_steps": 50,
 
416
  "attributes": {}
417
  }
418
  },
419
+ "total_flos": 4.477753664307475e+16,
420
  "train_batch_size": 16,
421
  "trial_name": null,
422
  "trial_params": null