MeedoSam commited on
Commit
d2b9496
1 Parent(s): d9f7088

Uploaded checkpoint-4000

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8153c1ec03df5594efef9482f36db85ac0366821320fba1ba8aae357d7d7188
3
  size 119975656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae99966885dfadca210314bf64872ce443f70308df6e4727adcc50f428ab66db
3
  size 119975656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:48f7a51201319a8a132484b82cfc2452f1693df833826e55a13467cec23fc927
3
  size 60477396
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2812ec63c28059aad0edb8123a9e90f5f8301e979f2372ce02fe039956e98169
3
  size 60477396
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:89a6ecb1fb2aa6a3c4d8bd6fdae6076f15725b87d99a6f3bffd86e06ab5951a4
3
- size 14180
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b12fc07e36413d2b0b11012030944d448c215499606c7c88123ca1e537650ca8
3
+ size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:770db92ac44ccb712216aece2abb8a41e68fd6d952c7ae7884e9032fb3cc3f81
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f80b0441e18382140898e5947e4bf00161c8985bfd13094069daa8dad861cc8
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9399232396020991,
5
  "eval_steps": 100,
6
- "global_step": 3000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -457,6 +457,156 @@
457
  "eval_samples_per_second": 5.189,
458
  "eval_steps_per_second": 5.189,
459
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
  }
461
  ],
462
  "logging_steps": 100,
@@ -464,7 +614,7 @@
464
  "num_input_tokens_seen": 0,
465
  "num_train_epochs": 2,
466
  "save_steps": 1000,
467
- "total_flos": 4.8306377981952e+16,
468
  "train_batch_size": 1,
469
  "trial_name": null,
470
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.2532309861361322,
5
  "eval_steps": 100,
6
+ "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
457
  "eval_samples_per_second": 5.189,
458
  "eval_steps_per_second": 5.189,
459
  "step": 3000
460
+ },
461
+ {
462
+ "epoch": 0.97,
463
+ "grad_norm": 0.0015150770777836442,
464
+ "learning_rate": 8.444444444444446e-06,
465
+ "loss": 0.055,
466
+ "step": 3100
467
+ },
468
+ {
469
+ "epoch": 0.97,
470
+ "eval_loss": 0.020349696278572083,
471
+ "eval_runtime": 192.752,
472
+ "eval_samples_per_second": 5.188,
473
+ "eval_steps_per_second": 5.188,
474
+ "step": 3100
475
+ },
476
+ {
477
+ "epoch": 1.0,
478
+ "grad_norm": 0.8284673690795898,
479
+ "learning_rate": 8.000000000000001e-06,
480
+ "loss": 0.0424,
481
+ "step": 3200
482
+ },
483
+ {
484
+ "epoch": 1.0,
485
+ "eval_loss": 0.011587778106331825,
486
+ "eval_runtime": 192.4082,
487
+ "eval_samples_per_second": 5.197,
488
+ "eval_steps_per_second": 5.197,
489
+ "step": 3200
490
+ },
491
+ {
492
+ "epoch": 1.03,
493
+ "grad_norm": 0.00324226007796824,
494
+ "learning_rate": 7.555555555555556e-06,
495
+ "loss": 0.0232,
496
+ "step": 3300
497
+ },
498
+ {
499
+ "epoch": 1.03,
500
+ "eval_loss": 0.02541309781372547,
501
+ "eval_runtime": 192.4753,
502
+ "eval_samples_per_second": 5.195,
503
+ "eval_steps_per_second": 5.195,
504
+ "step": 3300
505
+ },
506
+ {
507
+ "epoch": 1.07,
508
+ "grad_norm": 0.0018368299352005124,
509
+ "learning_rate": 7.111111111111112e-06,
510
+ "loss": 0.0391,
511
+ "step": 3400
512
+ },
513
+ {
514
+ "epoch": 1.07,
515
+ "eval_loss": 0.019817600026726723,
516
+ "eval_runtime": 192.2908,
517
+ "eval_samples_per_second": 5.2,
518
+ "eval_steps_per_second": 5.2,
519
+ "step": 3400
520
+ },
521
+ {
522
+ "epoch": 1.1,
523
+ "grad_norm": 0.001375267980620265,
524
+ "learning_rate": 6.666666666666667e-06,
525
+ "loss": 0.029,
526
+ "step": 3500
527
+ },
528
+ {
529
+ "epoch": 1.1,
530
+ "eval_loss": 0.014760646037757397,
531
+ "eval_runtime": 192.1713,
532
+ "eval_samples_per_second": 5.204,
533
+ "eval_steps_per_second": 5.204,
534
+ "step": 3500
535
+ },
536
+ {
537
+ "epoch": 1.13,
538
+ "grad_norm": 1.2048271894454956,
539
+ "learning_rate": 6.222222222222223e-06,
540
+ "loss": 0.028,
541
+ "step": 3600
542
+ },
543
+ {
544
+ "epoch": 1.13,
545
+ "eval_loss": 0.024096647277474403,
546
+ "eval_runtime": 191.7597,
547
+ "eval_samples_per_second": 5.215,
548
+ "eval_steps_per_second": 5.215,
549
+ "step": 3600
550
+ },
551
+ {
552
+ "epoch": 1.16,
553
+ "grad_norm": 1.7010436058044434,
554
+ "learning_rate": 5.777777777777778e-06,
555
+ "loss": 0.033,
556
+ "step": 3700
557
+ },
558
+ {
559
+ "epoch": 1.16,
560
+ "eval_loss": 0.024101875722408295,
561
+ "eval_runtime": 191.6566,
562
+ "eval_samples_per_second": 5.218,
563
+ "eval_steps_per_second": 5.218,
564
+ "step": 3700
565
+ },
566
+ {
567
+ "epoch": 1.19,
568
+ "grad_norm": 0.4044632613658905,
569
+ "learning_rate": 5.333333333333334e-06,
570
+ "loss": 0.0411,
571
+ "step": 3800
572
+ },
573
+ {
574
+ "epoch": 1.19,
575
+ "eval_loss": 0.020846880972385406,
576
+ "eval_runtime": 192.1837,
577
+ "eval_samples_per_second": 5.203,
578
+ "eval_steps_per_second": 5.203,
579
+ "step": 3800
580
+ },
581
+ {
582
+ "epoch": 1.22,
583
+ "grad_norm": 0.47499576210975647,
584
+ "learning_rate": 4.888888888888889e-06,
585
+ "loss": 0.0233,
586
+ "step": 3900
587
+ },
588
+ {
589
+ "epoch": 1.22,
590
+ "eval_loss": 0.010989435017108917,
591
+ "eval_runtime": 192.4351,
592
+ "eval_samples_per_second": 5.197,
593
+ "eval_steps_per_second": 5.197,
594
+ "step": 3900
595
+ },
596
+ {
597
+ "epoch": 1.25,
598
+ "grad_norm": 0.017001571133732796,
599
+ "learning_rate": 4.444444444444444e-06,
600
+ "loss": 0.0204,
601
+ "step": 4000
602
+ },
603
+ {
604
+ "epoch": 1.25,
605
+ "eval_loss": 0.02116994932293892,
606
+ "eval_runtime": 192.8643,
607
+ "eval_samples_per_second": 5.185,
608
+ "eval_steps_per_second": 5.185,
609
+ "step": 4000
610
  }
611
  ],
612
  "logging_steps": 100,
 
614
  "num_input_tokens_seen": 0,
615
  "num_train_epochs": 2,
616
  "save_steps": 1000,
617
+ "total_flos": 6.4408503975936e+16,
618
  "train_batch_size": 1,
619
  "trial_name": null,
620
  "trial_params": null