AlekseyKorshuk commited on
Commit
d439efe
1 Parent(s): cb31c93

huggingartists

Browse files
README.md CHANGED
@@ -14,7 +14,7 @@ widget:
14
  <div class="inline-flex flex-col" style="line-height: 1.5;">
15
  <div class="flex">
16
  <div
17
- style="display:DISPLAY_1; margin-left: auto; margin-right: auto; width: 92px; height:92px; border-radius: 50%; background-size: cover; background-image: url(&#39;https://images.genius.com/e4051ba629f57622a2a35571e5676824.720x720x1.jpg&#39;)">
18
  </div>
19
  </div>
20
  <div style="text-align: center; margin-top: 3px; font-size: 16px; font-weight: 800">🤖 HuggingArtists Model 🤖</div>
@@ -45,15 +45,15 @@ from datasets import load_dataset
45
  dataset = load_dataset("huggingartists/madonna")
46
  ```
47
 
48
- [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/1zgkgxrp/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Madonna's lyrics.
53
 
54
- Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/1nejy753) for full transparency and reproducibility.
55
 
56
- At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/1nejy753/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
 
14
  <div class="inline-flex flex-col" style="line-height: 1.5;">
15
  <div class="flex">
16
  <div
17
+ style="display:DISPLAY_1; margin-left: auto; margin-right: auto; width: 92px; height:92px; border-radius: 50%; background-size: cover; background-image: url(&#39;https://images.genius.com/676c1c425eaa8e7600136c56af6dfada.1000x1000x1.jpg&#39;)">
18
  </div>
19
  </div>
20
  <div style="text-align: center; margin-top: 3px; font-size: 16px; font-weight: 800">🤖 HuggingArtists Model 🤖</div>
 
45
  dataset = load_dataset("huggingartists/madonna")
46
  ```
47
 
48
+ [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/2abhif57/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Madonna's lyrics.
53
 
54
+ Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/2eok9fmu) for full transparency and reproducibility.
55
 
56
+ At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/2eok9fmu/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
config.json CHANGED
@@ -37,7 +37,7 @@
37
  }
38
  },
39
  "torch_dtype": "float32",
40
- "transformers_version": "4.15.0",
41
  "use_cache": true,
42
  "vocab_size": 50257
43
  }
 
37
  }
38
  },
39
  "torch_dtype": "float32",
40
+ "transformers_version": "4.20.0",
41
  "use_cache": true,
42
  "vocab_size": 50257
43
  }
evaluation.txt CHANGED
@@ -1 +1 @@
1
- {"eval_loss": 1.5412778854370117, "eval_runtime": 17.7742, "eval_samples_per_second": 17.272, "eval_steps_per_second": 2.194, "epoch": 2.0}
 
1
+ {"eval_loss": 1.1562458276748657, "eval_runtime": 2.2356, "eval_samples_per_second": 140.904, "eval_steps_per_second": 17.893, "epoch": 7.0}
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25566d27aa0a638f5602254e69031264732abeaa33b25755bd5b7e9febdce8f7
3
  size 497764120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b183f932b6e1b0f15eb8b249be6289dc2bba247f996bd190b20496df0370caea
3
  size 497764120
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc55df2228257ecd9fbf03f2979f7659de46424b72ffa514ec925c65c4ceaf50
3
  size 995604017
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea3f7591a0adca3a3b98dc49de974b7d297f3f1539fe9164e7fe7c3495012346
3
  size 995604017
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cffe2698db26bffad1da8bea41d71775f16f1b35e022d24c8c80b1ca1297a533
3
- size 510403817
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a820a71cbebdc8c969a77130a86384b35e8bcc137c87817368b069d5ec9e35fd
3
+ size 510396521
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0706774df0979b928412046725f3584988207df65c21b575ba45213c1e46e201
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d46dbf93ee6615ec3fae58d5cd6ee398ed5760083a374226895338bf2439d78
3
  size 14503
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:73d66e37400481ce71994edfed806089e04a5cb8f448f68f1ec267b591d1dbe6
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d21bbb47b7bf28ce88efd32118fc5bbd03f4e4f2ef87c2963fecf564d95c240
3
  size 623
special_tokens_map.json CHANGED
@@ -1 +1,5 @@
1
- {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"}
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "unk_token": "<|endoftext|>"
5
+ }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1 +1,10 @@
1
- {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "huggingartists/madonna", "tokenizer_class": "GPT2Tokenizer"}
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "model_max_length": 1024,
6
+ "name_or_path": "huggingartists/madonna",
7
+ "special_tokens_map_file": null,
8
+ "tokenizer_class": "GPT2Tokenizer",
9
+ "unk_token": "<|endoftext|>"
10
+ }
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "best_metric": 1.5412778854370117,
3
- "best_model_checkpoint": "output/madonna/checkpoint-448",
4
- "epoch": 2.0,
5
- "global_step": 448,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -564,11 +564,1389 @@
564
  "eval_samples_per_second": 17.282,
565
  "eval_steps_per_second": 2.195,
566
  "step": 448
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
  }
568
  ],
569
- "max_steps": 448,
570
- "num_train_epochs": 2,
571
- "total_flos": 467712737280000.0,
572
  "trial_name": null,
573
  "trial_params": null
574
  }
 
1
  {
2
+ "best_metric": 1.1562458276748657,
3
+ "best_model_checkpoint": "output/madonna/checkpoint-1561",
4
+ "epoch": 7.0,
5
+ "global_step": 1561,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
564
  "eval_samples_per_second": 17.282,
565
  "eval_steps_per_second": 2.195,
566
  "step": 448
567
+ },
568
+ {
569
+ "epoch": 2.02,
570
+ "learning_rate": 0.00013709110969999672,
571
+ "loss": 1.6748,
572
+ "step": 450
573
+ },
574
+ {
575
+ "epoch": 2.04,
576
+ "learning_rate": 0.0001366493352528768,
577
+ "loss": 1.7921,
578
+ "step": 455
579
+ },
580
+ {
581
+ "epoch": 2.06,
582
+ "learning_rate": 0.00013587006036997336,
583
+ "loss": 1.6524,
584
+ "step": 460
585
+ },
586
+ {
587
+ "epoch": 2.09,
588
+ "learning_rate": 0.00013475714997680845,
589
+ "loss": 1.5413,
590
+ "step": 465
591
+ },
592
+ {
593
+ "epoch": 2.11,
594
+ "learning_rate": 0.00013331612371205717,
595
+ "loss": 1.5745,
596
+ "step": 470
597
+ },
598
+ {
599
+ "epoch": 2.13,
600
+ "learning_rate": 0.0001315541285521084,
601
+ "loss": 1.4608,
602
+ "step": 475
603
+ },
604
+ {
605
+ "epoch": 2.15,
606
+ "learning_rate": 0.0001294799033646105,
607
+ "loss": 1.637,
608
+ "step": 480
609
+ },
610
+ {
611
+ "epoch": 2.17,
612
+ "learning_rate": 0.00012710373556680405,
613
+ "loss": 1.5212,
614
+ "step": 485
615
+ },
616
+ {
617
+ "epoch": 2.2,
618
+ "learning_rate": 0.00012443741010360104,
619
+ "loss": 1.3742,
620
+ "step": 490
621
+ },
622
+ {
623
+ "epoch": 2.22,
624
+ "learning_rate": 0.00012149415099846082,
625
+ "loss": 1.5598,
626
+ "step": 495
627
+ },
628
+ {
629
+ "epoch": 2.24,
630
+ "learning_rate": 0.0001182885557669493,
631
+ "loss": 1.5528,
632
+ "step": 500
633
+ },
634
+ {
635
+ "epoch": 2.26,
636
+ "learning_rate": 0.00011483652301826763,
637
+ "loss": 1.3306,
638
+ "step": 505
639
+ },
640
+ {
641
+ "epoch": 2.29,
642
+ "learning_rate": 0.00011115517360381999,
643
+ "loss": 1.6203,
644
+ "step": 510
645
+ },
646
+ {
647
+ "epoch": 2.31,
648
+ "learning_rate": 0.00010726276570389622,
649
+ "loss": 1.5496,
650
+ "step": 515
651
+ },
652
+ {
653
+ "epoch": 2.33,
654
+ "learning_rate": 0.00010317860427360839,
655
+ "loss": 1.5886,
656
+ "step": 520
657
+ },
658
+ {
659
+ "epoch": 2.35,
660
+ "learning_rate": 9.892294529719824e-05,
661
+ "loss": 1.5832,
662
+ "step": 525
663
+ },
664
+ {
665
+ "epoch": 2.38,
666
+ "learning_rate": 9.451689532557822e-05,
667
+ "loss": 1.374,
668
+ "step": 530
669
+ },
670
+ {
671
+ "epoch": 2.4,
672
+ "learning_rate": 8.998230679536645e-05,
673
+ "loss": 1.615,
674
+ "step": 535
675
+ },
676
+ {
677
+ "epoch": 2.42,
678
+ "learning_rate": 8.534166964859326e-05,
679
+ "loss": 1.2303,
680
+ "step": 540
681
+ },
682
+ {
683
+ "epoch": 2.44,
684
+ "learning_rate": 8.061799979060907e-05,
685
+ "loss": 1.5842,
686
+ "step": 545
687
+ },
688
+ {
689
+ "epoch": 2.47,
690
+ "learning_rate": 7.5834724939402e-05,
691
+ "loss": 1.6827,
692
+ "step": 550
693
+ },
694
+ {
695
+ "epoch": 2.49,
696
+ "learning_rate": 7.10155684324705e-05,
697
+ "loss": 1.3135,
698
+ "step": 555
699
+ },
700
+ {
701
+ "epoch": 2.51,
702
+ "learning_rate": 6.618443156752956e-05,
703
+ "loss": 1.2597,
704
+ "step": 560
705
+ },
706
+ {
707
+ "epoch": 2.53,
708
+ "learning_rate": 6.136527506059804e-05,
709
+ "loss": 1.3309,
710
+ "step": 565
711
+ },
712
+ {
713
+ "epoch": 2.56,
714
+ "learning_rate": 5.658200020939097e-05,
715
+ "loss": 1.3751,
716
+ "step": 570
717
+ },
718
+ {
719
+ "epoch": 2.58,
720
+ "learning_rate": 5.185833035140678e-05,
721
+ "loss": 1.4378,
722
+ "step": 575
723
+ },
724
+ {
725
+ "epoch": 2.6,
726
+ "learning_rate": 4.721769320463358e-05,
727
+ "loss": 1.3661,
728
+ "step": 580
729
+ },
730
+ {
731
+ "epoch": 2.62,
732
+ "learning_rate": 4.268310467442181e-05,
733
+ "loss": 1.4564,
734
+ "step": 585
735
+ },
736
+ {
737
+ "epoch": 2.65,
738
+ "learning_rate": 3.827705470280179e-05,
739
+ "loss": 1.2126,
740
+ "step": 590
741
+ },
742
+ {
743
+ "epoch": 2.67,
744
+ "learning_rate": 3.4021395726391656e-05,
745
+ "loss": 1.3859,
746
+ "step": 595
747
+ },
748
+ {
749
+ "epoch": 2.69,
750
+ "learning_rate": 2.9937234296103807e-05,
751
+ "loss": 1.4162,
752
+ "step": 600
753
+ },
754
+ {
755
+ "epoch": 2.71,
756
+ "learning_rate": 2.6044826396180043e-05,
757
+ "loss": 1.3521,
758
+ "step": 605
759
+ },
760
+ {
761
+ "epoch": 2.74,
762
+ "learning_rate": 2.236347698173244e-05,
763
+ "loss": 1.4423,
764
+ "step": 610
765
+ },
766
+ {
767
+ "epoch": 2.76,
768
+ "learning_rate": 1.8911444233050724e-05,
769
+ "loss": 1.2658,
770
+ "step": 615
771
+ },
772
+ {
773
+ "epoch": 2.78,
774
+ "learning_rate": 1.5705849001539244e-05,
775
+ "loss": 1.2766,
776
+ "step": 620
777
+ },
778
+ {
779
+ "epoch": 2.8,
780
+ "learning_rate": 1.2762589896399027e-05,
781
+ "loss": 1.2592,
782
+ "step": 625
783
+ },
784
+ {
785
+ "epoch": 2.83,
786
+ "learning_rate": 1.0096264433195999e-05,
787
+ "loss": 1.4557,
788
+ "step": 630
789
+ },
790
+ {
791
+ "epoch": 2.85,
792
+ "learning_rate": 7.72009663538956e-06,
793
+ "loss": 1.4331,
794
+ "step": 635
795
+ },
796
+ {
797
+ "epoch": 2.87,
798
+ "learning_rate": 5.6458714478916046e-06,
799
+ "loss": 1.5097,
800
+ "step": 640
801
+ },
802
+ {
803
+ "epoch": 2.89,
804
+ "learning_rate": 3.883876287942861e-06,
805
+ "loss": 1.474,
806
+ "step": 645
807
+ },
808
+ {
809
+ "epoch": 2.91,
810
+ "learning_rate": 2.4428500231915692e-06,
811
+ "loss": 1.4667,
812
+ "step": 650
813
+ },
814
+ {
815
+ "epoch": 2.94,
816
+ "learning_rate": 1.32993963002664e-06,
817
+ "loss": 1.3699,
818
+ "step": 655
819
+ },
820
+ {
821
+ "epoch": 2.96,
822
+ "learning_rate": 5.506647471231935e-07,
823
+ "loss": 1.4343,
824
+ "step": 660
825
+ },
826
+ {
827
+ "epoch": 2.98,
828
+ "learning_rate": 1.0889030000328246e-07,
829
+ "loss": 1.3816,
830
+ "step": 665
831
+ },
832
+ {
833
+ "epoch": 3.0,
834
+ "eval_loss": 1.3440579175949097,
835
+ "eval_runtime": 2.2108,
836
+ "eval_samples_per_second": 142.485,
837
+ "eval_steps_per_second": 18.093,
838
+ "step": 669
839
+ },
840
+ {
841
+ "epoch": 3.0,
842
+ "learning_rate": 6.807332383425125e-09,
843
+ "loss": 1.2189,
844
+ "step": 670
845
+ },
846
+ {
847
+ "epoch": 3.03,
848
+ "learning_rate": 2.4492213937922245e-07,
849
+ "loss": 1.2422,
850
+ "step": 675
851
+ },
852
+ {
853
+ "epoch": 3.05,
854
+ "learning_rate": 8.22053756462089e-07,
855
+ "loss": 1.3055,
856
+ "step": 680
857
+ },
858
+ {
859
+ "epoch": 3.07,
860
+ "learning_rate": 1.735339816622185e-06,
861
+ "loss": 1.4475,
862
+ "step": 685
863
+ },
864
+ {
865
+ "epoch": 3.09,
866
+ "learning_rate": 2.980250746687993e-06,
867
+ "loss": 1.2442,
868
+ "step": 690
869
+ },
870
+ {
871
+ "epoch": 3.12,
872
+ "learning_rate": 4.550612232394036e-06,
873
+ "loss": 1.4277,
874
+ "step": 695
875
+ },
876
+ {
877
+ "epoch": 3.14,
878
+ "learning_rate": 6.438635840777816e-06,
879
+ "loss": 1.3777,
880
+ "step": 700
881
+ },
882
+ {
883
+ "epoch": 3.16,
884
+ "learning_rate": 8.634957648029665e-06,
885
+ "loss": 1.3328,
886
+ "step": 705
887
+ },
888
+ {
889
+ "epoch": 3.18,
890
+ "learning_rate": 1.1128684681215911e-05,
891
+ "loss": 1.3339,
892
+ "step": 710
893
+ },
894
+ {
895
+ "epoch": 3.21,
896
+ "learning_rate": 1.3907448943539908e-05,
897
+ "loss": 1.2034,
898
+ "step": 715
899
+ },
900
+ {
901
+ "epoch": 3.23,
902
+ "learning_rate": 1.6957468755195406e-05,
903
+ "loss": 1.1218,
904
+ "step": 720
905
+ },
906
+ {
907
+ "epoch": 3.25,
908
+ "learning_rate": 2.0263617105584107e-05,
909
+ "loss": 1.2544,
910
+ "step": 725
911
+ },
912
+ {
913
+ "epoch": 3.27,
914
+ "learning_rate": 2.3809496677893602e-05,
915
+ "loss": 1.1479,
916
+ "step": 730
917
+ },
918
+ {
919
+ "epoch": 3.3,
920
+ "learning_rate": 2.757752117394104e-05,
921
+ "loss": 1.1837,
922
+ "step": 735
923
+ },
924
+ {
925
+ "epoch": 3.32,
926
+ "learning_rate": 3.1549002535941176e-05,
927
+ "loss": 1.3242,
928
+ "step": 740
929
+ },
930
+ {
931
+ "epoch": 3.34,
932
+ "learning_rate": 3.570424363260841e-05,
933
+ "loss": 1.1755,
934
+ "step": 745
935
+ },
936
+ {
937
+ "epoch": 3.36,
938
+ "learning_rate": 4.002263594990341e-05,
939
+ "loss": 1.2206,
940
+ "step": 750
941
+ },
942
+ {
943
+ "epoch": 3.39,
944
+ "learning_rate": 4.448276180191424e-05,
945
+ "loss": 1.219,
946
+ "step": 755
947
+ },
948
+ {
949
+ "epoch": 3.41,
950
+ "learning_rate": 4.9062500554940324e-05,
951
+ "loss": 1.3741,
952
+ "step": 760
953
+ },
954
+ {
955
+ "epoch": 3.43,
956
+ "learning_rate": 5.373913833794688e-05,
957
+ "loss": 1.3206,
958
+ "step": 765
959
+ },
960
+ {
961
+ "epoch": 3.45,
962
+ "learning_rate": 5.848948069526606e-05,
963
+ "loss": 1.2842,
964
+ "step": 770
965
+ },
966
+ {
967
+ "epoch": 3.48,
968
+ "learning_rate": 6.32899676228247e-05,
969
+ "loss": 1.4653,
970
+ "step": 775
971
+ },
972
+ {
973
+ "epoch": 3.5,
974
+ "learning_rate": 6.811679041736369e-05,
975
+ "loss": 1.0829,
976
+ "step": 780
977
+ },
978
+ {
979
+ "epoch": 3.52,
980
+ "learning_rate": 7.294600975911534e-05,
981
+ "loss": 1.2317,
982
+ "step": 785
983
+ },
984
+ {
985
+ "epoch": 3.54,
986
+ "learning_rate": 7.775367444229201e-05,
987
+ "loss": 1.0955,
988
+ "step": 790
989
+ },
990
+ {
991
+ "epoch": 3.57,
992
+ "learning_rate": 8.251594016452641e-05,
993
+ "loss": 1.3158,
994
+ "step": 795
995
+ },
996
+ {
997
+ "epoch": 3.59,
998
+ "learning_rate": 8.720918778610773e-05,
999
+ "loss": 1.2593,
1000
+ "step": 800
1001
+ },
1002
+ {
1003
+ "epoch": 3.61,
1004
+ "learning_rate": 9.181014047249159e-05,
1005
+ "loss": 1.1672,
1006
+ "step": 805
1007
+ },
1008
+ {
1009
+ "epoch": 3.63,
1010
+ "learning_rate": 9.629597913909913e-05,
1011
+ "loss": 1.2407,
1012
+ "step": 810
1013
+ },
1014
+ {
1015
+ "epoch": 3.65,
1016
+ "learning_rate": 0.0001006444556258389,
1017
+ "loss": 1.2637,
1018
+ "step": 815
1019
+ },
1020
+ {
1021
+ "epoch": 3.68,
1022
+ "learning_rate": 0.00010483400304004644,
1023
+ "loss": 1.4097,
1024
+ "step": 820
1025
+ },
1026
+ {
1027
+ "epoch": 3.7,
1028
+ "learning_rate": 0.00010884384272058186,
1029
+ "loss": 1.3197,
1030
+ "step": 825
1031
+ },
1032
+ {
1033
+ "epoch": 3.72,
1034
+ "learning_rate": 0.00011265408729257823,
1035
+ "loss": 1.0697,
1036
+ "step": 830
1037
+ },
1038
+ {
1039
+ "epoch": 3.74,
1040
+ "learning_rate": 0.00011624583930172985,
1041
+ "loss": 1.079,
1042
+ "step": 835
1043
+ },
1044
+ {
1045
+ "epoch": 3.77,
1046
+ "learning_rate": 0.00011960128493892567,
1047
+ "loss": 1.2747,
1048
+ "step": 840
1049
+ },
1050
+ {
1051
+ "epoch": 3.79,
1052
+ "learning_rate": 0.0001227037823903907,
1053
+ "loss": 1.3586,
1054
+ "step": 845
1055
+ },
1056
+ {
1057
+ "epoch": 3.81,
1058
+ "learning_rate": 0.000125537944375147,
1059
+ "loss": 1.3766,
1060
+ "step": 850
1061
+ },
1062
+ {
1063
+ "epoch": 3.83,
1064
+ "learning_rate": 0.00012808971446044075,
1065
+ "loss": 1.2691,
1066
+ "step": 855
1067
+ },
1068
+ {
1069
+ "epoch": 3.86,
1070
+ "learning_rate": 0.00013034643677663536,
1071
+ "loss": 1.1551,
1072
+ "step": 860
1073
+ },
1074
+ {
1075
+ "epoch": 3.88,
1076
+ "learning_rate": 0.00013229691878581222,
1077
+ "loss": 1.3372,
1078
+ "step": 865
1079
+ },
1080
+ {
1081
+ "epoch": 3.9,
1082
+ "learning_rate": 0.00013393148679276723,
1083
+ "loss": 1.2608,
1084
+ "step": 870
1085
+ },
1086
+ {
1087
+ "epoch": 3.92,
1088
+ "learning_rate": 0.00013524203392308896,
1089
+ "loss": 1.1078,
1090
+ "step": 875
1091
+ },
1092
+ {
1093
+ "epoch": 3.95,
1094
+ "learning_rate": 0.00013622206033036527,
1095
+ "loss": 1.0532,
1096
+ "step": 880
1097
+ },
1098
+ {
1099
+ "epoch": 3.97,
1100
+ "learning_rate": 0.00013686670543310324,
1101
+ "loss": 1.1992,
1102
+ "step": 885
1103
+ },
1104
+ {
1105
+ "epoch": 3.99,
1106
+ "learning_rate": 0.00013717277202148013,
1107
+ "loss": 1.1418,
1108
+ "step": 890
1109
+ },
1110
+ {
1111
+ "epoch": 4.0,
1112
+ "eval_loss": 1.3141199350357056,
1113
+ "eval_runtime": 2.2159,
1114
+ "eval_samples_per_second": 142.153,
1115
+ "eval_steps_per_second": 18.051,
1116
+ "step": 892
1117
+ },
1118
+ {
1119
+ "epoch": 4.01,
1120
+ "learning_rate": 0.000137138742114364,
1121
+ "loss": 1.0312,
1122
+ "step": 895
1123
+ },
1124
+ {
1125
+ "epoch": 4.04,
1126
+ "learning_rate": 0.00013676478448795875,
1127
+ "loss": 1.0096,
1128
+ "step": 900
1129
+ },
1130
+ {
1131
+ "epoch": 4.06,
1132
+ "learning_rate": 0.00013605275383873428,
1133
+ "loss": 1.1906,
1134
+ "step": 905
1135
+ },
1136
+ {
1137
+ "epoch": 4.08,
1138
+ "learning_rate": 0.00013500618158479363,
1139
+ "loss": 1.1664,
1140
+ "step": 910
1141
+ },
1142
+ {
1143
+ "epoch": 4.1,
1144
+ "learning_rate": 0.00013363025835129812,
1145
+ "loss": 1.0422,
1146
+ "step": 915
1147
+ },
1148
+ {
1149
+ "epoch": 4.13,
1150
+ "learning_rate": 0.00013193180822681808,
1151
+ "loss": 1.0758,
1152
+ "step": 920
1153
+ },
1154
+ {
1155
+ "epoch": 4.15,
1156
+ "learning_rate": 0.0001299192549182867,
1157
+ "loss": 1.3852,
1158
+ "step": 925
1159
+ },
1160
+ {
1161
+ "epoch": 4.17,
1162
+ "learning_rate": 0.0001276025799724176,
1163
+ "loss": 1.2089,
1164
+ "step": 930
1165
+ },
1166
+ {
1167
+ "epoch": 4.19,
1168
+ "learning_rate": 0.00012499327327079299,
1169
+ "loss": 1.1252,
1170
+ "step": 935
1171
+ },
1172
+ {
1173
+ "epoch": 4.22,
1174
+ "learning_rate": 0.00012210427604414836,
1175
+ "loss": 1.2521,
1176
+ "step": 940
1177
+ },
1178
+ {
1179
+ "epoch": 4.24,
1180
+ "learning_rate": 0.00011894991668848237,
1181
+ "loss": 1.2428,
1182
+ "step": 945
1183
+ },
1184
+ {
1185
+ "epoch": 4.26,
1186
+ "learning_rate": 0.00011554583970132328,
1187
+ "loss": 1.1082,
1188
+ "step": 950
1189
+ },
1190
+ {
1191
+ "epoch": 4.28,
1192
+ "learning_rate": 0.00011190892809059987,
1193
+ "loss": 1.0971,
1194
+ "step": 955
1195
+ },
1196
+ {
1197
+ "epoch": 4.3,
1198
+ "learning_rate": 0.00010805721964094202,
1199
+ "loss": 1.0657,
1200
+ "step": 960
1201
+ },
1202
+ {
1203
+ "epoch": 4.33,
1204
+ "learning_rate": 0.00010400981745270263,
1205
+ "loss": 1.0439,
1206
+ "step": 965
1207
+ },
1208
+ {
1209
+ "epoch": 4.35,
1210
+ "learning_rate": 9.978679519739164e-05,
1211
+ "loss": 1.1785,
1212
+ "step": 970
1213
+ },
1214
+ {
1215
+ "epoch": 4.37,
1216
+ "learning_rate": 9.540909755942318e-05,
1217
+ "loss": 1.1708,
1218
+ "step": 975
1219
+ },
1220
+ {
1221
+ "epoch": 4.39,
1222
+ "learning_rate": 9.089843635795119e-05,
1223
+ "loss": 1.0215,
1224
+ "step": 980
1225
+ },
1226
+ {
1227
+ "epoch": 4.42,
1228
+ "learning_rate": 8.627718286398852e-05,
1229
+ "loss": 1.1393,
1230
+ "step": 985
1231
+ },
1232
+ {
1233
+ "epoch": 4.44,
1234
+ "learning_rate": 8.156825684687972e-05,
1235
+ "loss": 1.0526,
1236
+ "step": 990
1237
+ },
1238
+ {
1239
+ "epoch": 4.46,
1240
+ "learning_rate": 7.679501290041993e-05,
1241
+ "loss": 0.9887,
1242
+ "step": 995
1243
+ },
1244
+ {
1245
+ "epoch": 4.48,
1246
+ "learning_rate": 7.198112461239741e-05,
1247
+ "loss": 0.8743,
1248
+ "step": 1000
1249
+ },
1250
+ {
1251
+ "epoch": 4.51,
1252
+ "learning_rate": 6.715046715204013e-05,
1253
+ "loss": 0.9096,
1254
+ "step": 1005
1255
+ },
1256
+ {
1257
+ "epoch": 4.53,
1258
+ "learning_rate": 6.232699885769075e-05,
1259
+ "loss": 1.2684,
1260
+ "step": 1010
1261
+ },
1262
+ {
1263
+ "epoch": 4.55,
1264
+ "learning_rate": 5.753464241199265e-05,
1265
+ "loss": 1.0412,
1266
+ "step": 1015
1267
+ },
1268
+ {
1269
+ "epoch": 4.57,
1270
+ "learning_rate": 5.279716619391422e-05,
1271
+ "loss": 0.9954,
1272
+ "step": 1020
1273
+ },
1274
+ {
1275
+ "epoch": 4.6,
1276
+ "learning_rate": 4.813806639606615e-05,
1277
+ "loss": 0.8753,
1278
+ "step": 1025
1279
+ },
1280
+ {
1281
+ "epoch": 4.62,
1282
+ "learning_rate": 4.358045049196426e-05,
1283
+ "loss": 0.8638,
1284
+ "step": 1030
1285
+ },
1286
+ {
1287
+ "epoch": 4.64,
1288
+ "learning_rate": 3.9146922631201556e-05,
1289
+ "loss": 1.0574,
1290
+ "step": 1035
1291
+ },
1292
+ {
1293
+ "epoch": 4.66,
1294
+ "learning_rate": 3.485947153092735e-05,
1295
+ "loss": 0.8752,
1296
+ "step": 1040
1297
+ },
1298
+ {
1299
+ "epoch": 4.69,
1300
+ "learning_rate": 3.073936141965131e-05,
1301
+ "loss": 1.0359,
1302
+ "step": 1045
1303
+ },
1304
+ {
1305
+ "epoch": 4.71,
1306
+ "learning_rate": 2.680702657425298e-05,
1307
+ "loss": 1.0701,
1308
+ "step": 1050
1309
+ },
1310
+ {
1311
+ "epoch": 4.73,
1312
+ "learning_rate": 2.3081969973255665e-05,
1313
+ "loss": 0.9777,
1314
+ "step": 1055
1315
+ },
1316
+ {
1317
+ "epoch": 4.75,
1318
+ "learning_rate": 1.9582666569008116e-05,
1319
+ "loss": 0.9861,
1320
+ "step": 1060
1321
+ },
1322
+ {
1323
+ "epoch": 4.78,
1324
+ "learning_rate": 1.632647165850912e-05,
1325
+ "loss": 1.0365,
1326
+ "step": 1065
1327
+ },
1328
+ {
1329
+ "epoch": 4.8,
1330
+ "learning_rate": 1.3329534807322873e-05,
1331
+ "loss": 0.9034,
1332
+ "step": 1070
1333
+ },
1334
+ {
1335
+ "epoch": 4.82,
1336
+ "learning_rate": 1.0606719753493918e-05,
1337
+ "loss": 1.0476,
1338
+ "step": 1075
1339
+ },
1340
+ {
1341
+ "epoch": 4.84,
1342
+ "learning_rate": 8.171530688706377e-06,
1343
+ "loss": 1.0288,
1344
+ "step": 1080
1345
+ },
1346
+ {
1347
+ "epoch": 4.87,
1348
+ "learning_rate": 6.036045282308682e-06,
1349
+ "loss": 1.1233,
1350
+ "step": 1085
1351
+ },
1352
+ {
1353
+ "epoch": 4.89,
1354
+ "learning_rate": 4.210854780381334e-06,
1355
+ "loss": 0.9158,
1356
+ "step": 1090
1357
+ },
1358
+ {
1359
+ "epoch": 4.91,
1360
+ "learning_rate": 2.705011476932832e-06,
1361
+ "loss": 1.0037,
1362
+ "step": 1095
1363
+ },
1364
+ {
1365
+ "epoch": 4.93,
1366
+ "learning_rate": 1.5259838177494246e-06,
1367
+ "loss": 1.279,
1368
+ "step": 1100
1369
+ },
1370
+ {
1371
+ "epoch": 4.96,
1372
+ "learning_rate": 6.796193595677371e-07,
1373
+ "loss": 1.0122,
1374
+ "step": 1105
1375
+ },
1376
+ {
1377
+ "epoch": 4.98,
1378
+ "learning_rate": 1.7011576827836676e-07,
1379
+ "loss": 1.058,
1380
+ "step": 1110
1381
+ },
1382
+ {
1383
+ "epoch": 5.0,
1384
+ "learning_rate": 0.0,
1385
+ "loss": 1.0917,
1386
+ "step": 1115
1387
+ },
1388
+ {
1389
+ "epoch": 5.0,
1390
+ "eval_loss": 1.2071579694747925,
1391
+ "eval_runtime": 2.221,
1392
+ "eval_samples_per_second": 141.829,
1393
+ "eval_steps_per_second": 18.01,
1394
+ "step": 1115
1395
+ },
1396
+ {
1397
+ "epoch": 5.02,
1398
+ "learning_rate": 1.7011576827835916e-07,
1399
+ "loss": 0.9624,
1400
+ "step": 1120
1401
+ },
1402
+ {
1403
+ "epoch": 5.04,
1404
+ "learning_rate": 6.796193595677219e-07,
1405
+ "loss": 1.0903,
1406
+ "step": 1125
1407
+ },
1408
+ {
1409
+ "epoch": 5.07,
1410
+ "learning_rate": 1.5259838177494094e-06,
1411
+ "loss": 0.8195,
1412
+ "step": 1130
1413
+ },
1414
+ {
1415
+ "epoch": 5.09,
1416
+ "learning_rate": 2.705011476932809e-06,
1417
+ "loss": 0.9007,
1418
+ "step": 1135
1419
+ },
1420
+ {
1421
+ "epoch": 5.11,
1422
+ "learning_rate": 4.210854780381303e-06,
1423
+ "loss": 0.6803,
1424
+ "step": 1140
1425
+ },
1426
+ {
1427
+ "epoch": 5.13,
1428
+ "learning_rate": 6.036045282308651e-06,
1429
+ "loss": 0.9312,
1430
+ "step": 1145
1431
+ },
1432
+ {
1433
+ "epoch": 5.16,
1434
+ "learning_rate": 8.171530688706338e-06,
1435
+ "loss": 1.0329,
1436
+ "step": 1150
1437
+ },
1438
+ {
1439
+ "epoch": 5.18,
1440
+ "learning_rate": 1.0606719753493872e-05,
1441
+ "loss": 0.9101,
1442
+ "step": 1155
1443
+ },
1444
+ {
1445
+ "epoch": 5.2,
1446
+ "learning_rate": 1.3329534807322828e-05,
1447
+ "loss": 0.8229,
1448
+ "step": 1160
1449
+ },
1450
+ {
1451
+ "epoch": 5.22,
1452
+ "learning_rate": 1.6326471658509066e-05,
1453
+ "loss": 1.0693,
1454
+ "step": 1165
1455
+ },
1456
+ {
1457
+ "epoch": 5.25,
1458
+ "learning_rate": 1.9582666569008055e-05,
1459
+ "loss": 0.7779,
1460
+ "step": 1170
1461
+ },
1462
+ {
1463
+ "epoch": 5.27,
1464
+ "learning_rate": 2.3081969973255604e-05,
1465
+ "loss": 0.7962,
1466
+ "step": 1175
1467
+ },
1468
+ {
1469
+ "epoch": 5.29,
1470
+ "learning_rate": 2.680702657425292e-05,
1471
+ "loss": 0.9127,
1472
+ "step": 1180
1473
+ },
1474
+ {
1475
+ "epoch": 5.31,
1476
+ "learning_rate": 3.073936141965114e-05,
1477
+ "loss": 0.8227,
1478
+ "step": 1185
1479
+ },
1480
+ {
1481
+ "epoch": 5.34,
1482
+ "learning_rate": 3.4859471530927266e-05,
1483
+ "loss": 0.9535,
1484
+ "step": 1190
1485
+ },
1486
+ {
1487
+ "epoch": 5.36,
1488
+ "learning_rate": 3.914692263120148e-05,
1489
+ "loss": 0.7736,
1490
+ "step": 1195
1491
+ },
1492
+ {
1493
+ "epoch": 5.38,
1494
+ "learning_rate": 4.358045049196419e-05,
1495
+ "loss": 0.9111,
1496
+ "step": 1200
1497
+ },
1498
+ {
1499
+ "epoch": 5.4,
1500
+ "learning_rate": 4.813806639606595e-05,
1501
+ "loss": 0.9058,
1502
+ "step": 1205
1503
+ },
1504
+ {
1505
+ "epoch": 5.43,
1506
+ "learning_rate": 5.279716619391414e-05,
1507
+ "loss": 0.8259,
1508
+ "step": 1210
1509
+ },
1510
+ {
1511
+ "epoch": 5.45,
1512
+ "learning_rate": 5.753464241199256e-05,
1513
+ "loss": 0.9812,
1514
+ "step": 1215
1515
+ },
1516
+ {
1517
+ "epoch": 5.47,
1518
+ "learning_rate": 6.232699885769054e-05,
1519
+ "loss": 1.0264,
1520
+ "step": 1220
1521
+ },
1522
+ {
1523
+ "epoch": 5.49,
1524
+ "learning_rate": 6.715046715203992e-05,
1525
+ "loss": 0.8185,
1526
+ "step": 1225
1527
+ },
1528
+ {
1529
+ "epoch": 5.52,
1530
+ "learning_rate": 7.198112461239733e-05,
1531
+ "loss": 0.8403,
1532
+ "step": 1230
1533
+ },
1534
+ {
1535
+ "epoch": 5.54,
1536
+ "learning_rate": 7.679501290041973e-05,
1537
+ "loss": 0.914,
1538
+ "step": 1235
1539
+ },
1540
+ {
1541
+ "epoch": 5.56,
1542
+ "learning_rate": 8.15682568468795e-05,
1543
+ "loss": 0.9836,
1544
+ "step": 1240
1545
+ },
1546
+ {
1547
+ "epoch": 5.58,
1548
+ "learning_rate": 8.627718286398833e-05,
1549
+ "loss": 0.9734,
1550
+ "step": 1245
1551
+ },
1552
+ {
1553
+ "epoch": 5.61,
1554
+ "learning_rate": 9.089843635795102e-05,
1555
+ "loss": 0.8468,
1556
+ "step": 1250
1557
+ },
1558
+ {
1559
+ "epoch": 5.63,
1560
+ "learning_rate": 9.540909755942299e-05,
1561
+ "loss": 0.835,
1562
+ "step": 1255
1563
+ },
1564
+ {
1565
+ "epoch": 5.65,
1566
+ "learning_rate": 9.978679519739145e-05,
1567
+ "loss": 1.0896,
1568
+ "step": 1260
1569
+ },
1570
+ {
1571
+ "epoch": 5.67,
1572
+ "learning_rate": 0.00010400981745270244,
1573
+ "loss": 0.7718,
1574
+ "step": 1265
1575
+ },
1576
+ {
1577
+ "epoch": 5.7,
1578
+ "learning_rate": 0.00010805721964094184,
1579
+ "loss": 0.8329,
1580
+ "step": 1270
1581
+ },
1582
+ {
1583
+ "epoch": 5.72,
1584
+ "learning_rate": 0.0001119089280905997,
1585
+ "loss": 0.9622,
1586
+ "step": 1275
1587
+ },
1588
+ {
1589
+ "epoch": 5.74,
1590
+ "learning_rate": 0.0001155458397013233,
1591
+ "loss": 0.8208,
1592
+ "step": 1280
1593
+ },
1594
+ {
1595
+ "epoch": 5.76,
1596
+ "learning_rate": 0.00011894991668848222,
1597
+ "loss": 0.9137,
1598
+ "step": 1285
1599
+ },
1600
+ {
1601
+ "epoch": 5.78,
1602
+ "learning_rate": 0.00012210427604414823,
1603
+ "loss": 1.0149,
1604
+ "step": 1290
1605
+ },
1606
+ {
1607
+ "epoch": 5.81,
1608
+ "learning_rate": 0.000124993273270793,
1609
+ "loss": 0.9856,
1610
+ "step": 1295
1611
+ },
1612
+ {
1613
+ "epoch": 5.83,
1614
+ "learning_rate": 0.0001276025799724176,
1615
+ "loss": 0.9686,
1616
+ "step": 1300
1617
+ },
1618
+ {
1619
+ "epoch": 5.85,
1620
+ "learning_rate": 0.0001299192549182866,
1621
+ "loss": 0.9409,
1622
+ "step": 1305
1623
+ },
1624
+ {
1625
+ "epoch": 5.87,
1626
+ "learning_rate": 0.00013193180822681808,
1627
+ "loss": 1.214,
1628
+ "step": 1310
1629
+ },
1630
+ {
1631
+ "epoch": 5.9,
1632
+ "learning_rate": 0.00013363025835129815,
1633
+ "loss": 0.8486,
1634
+ "step": 1315
1635
+ },
1636
+ {
1637
+ "epoch": 5.92,
1638
+ "learning_rate": 0.00013500618158479366,
1639
+ "loss": 1.0083,
1640
+ "step": 1320
1641
+ },
1642
+ {
1643
+ "epoch": 5.94,
1644
+ "learning_rate": 0.00013605275383873428,
1645
+ "loss": 0.9284,
1646
+ "step": 1325
1647
+ },
1648
+ {
1649
+ "epoch": 5.96,
1650
+ "learning_rate": 0.00013676478448795875,
1651
+ "loss": 1.0467,
1652
+ "step": 1330
1653
+ },
1654
+ {
1655
+ "epoch": 5.99,
1656
+ "learning_rate": 0.00013713874211436402,
1657
+ "loss": 0.9702,
1658
+ "step": 1335
1659
+ },
1660
+ {
1661
+ "epoch": 6.0,
1662
+ "eval_loss": 1.2323447465896606,
1663
+ "eval_runtime": 2.2212,
1664
+ "eval_samples_per_second": 141.817,
1665
+ "eval_steps_per_second": 18.008,
1666
+ "step": 1338
1667
+ },
1668
+ {
1669
+ "epoch": 6.01,
1670
+ "learning_rate": 0.00013717277202148013,
1671
+ "loss": 0.9361,
1672
+ "step": 1340
1673
+ },
1674
+ {
1675
+ "epoch": 6.03,
1676
+ "learning_rate": 0.00013686670543310324,
1677
+ "loss": 0.951,
1678
+ "step": 1345
1679
+ },
1680
+ {
1681
+ "epoch": 6.05,
1682
+ "learning_rate": 0.00013622206033036527,
1683
+ "loss": 0.7387,
1684
+ "step": 1350
1685
+ },
1686
+ {
1687
+ "epoch": 6.08,
1688
+ "learning_rate": 0.0001352420339230889,
1689
+ "loss": 0.8237,
1690
+ "step": 1355
1691
+ },
1692
+ {
1693
+ "epoch": 6.1,
1694
+ "learning_rate": 0.0001339314867927672,
1695
+ "loss": 0.8428,
1696
+ "step": 1360
1697
+ },
1698
+ {
1699
+ "epoch": 6.12,
1700
+ "learning_rate": 0.00013229691878581222,
1701
+ "loss": 0.8352,
1702
+ "step": 1365
1703
+ },
1704
+ {
1705
+ "epoch": 6.14,
1706
+ "learning_rate": 0.00013034643677663527,
1707
+ "loss": 0.8816,
1708
+ "step": 1370
1709
+ },
1710
+ {
1711
+ "epoch": 6.17,
1712
+ "learning_rate": 0.00012808971446044072,
1713
+ "loss": 0.781,
1714
+ "step": 1375
1715
+ },
1716
+ {
1717
+ "epoch": 6.19,
1718
+ "learning_rate": 0.00012553794437514699,
1719
+ "loss": 0.7146,
1720
+ "step": 1380
1721
+ },
1722
+ {
1723
+ "epoch": 6.21,
1724
+ "learning_rate": 0.0001227037823903906,
1725
+ "loss": 0.9361,
1726
+ "step": 1385
1727
+ },
1728
+ {
1729
+ "epoch": 6.23,
1730
+ "learning_rate": 0.00011960128493892572,
1731
+ "loss": 0.8889,
1732
+ "step": 1390
1733
+ },
1734
+ {
1735
+ "epoch": 6.26,
1736
+ "learning_rate": 0.00011624583930172982,
1737
+ "loss": 0.9329,
1738
+ "step": 1395
1739
+ },
1740
+ {
1741
+ "epoch": 6.28,
1742
+ "learning_rate": 0.0001126540872925782,
1743
+ "loss": 0.7853,
1744
+ "step": 1400
1745
+ },
1746
+ {
1747
+ "epoch": 6.3,
1748
+ "learning_rate": 0.00010884384272058193,
1749
+ "loss": 0.7188,
1750
+ "step": 1405
1751
+ },
1752
+ {
1753
+ "epoch": 6.32,
1754
+ "learning_rate": 0.00010483400304004653,
1755
+ "loss": 0.7662,
1756
+ "step": 1410
1757
+ },
1758
+ {
1759
+ "epoch": 6.35,
1760
+ "learning_rate": 0.00010064445562583886,
1761
+ "loss": 0.7705,
1762
+ "step": 1415
1763
+ },
1764
+ {
1765
+ "epoch": 6.37,
1766
+ "learning_rate": 9.629597913909932e-05,
1767
+ "loss": 0.9695,
1768
+ "step": 1420
1769
+ },
1770
+ {
1771
+ "epoch": 6.39,
1772
+ "learning_rate": 9.181014047249165e-05,
1773
+ "loss": 0.6245,
1774
+ "step": 1425
1775
+ },
1776
+ {
1777
+ "epoch": 6.41,
1778
+ "learning_rate": 8.72091877861077e-05,
1779
+ "loss": 0.9465,
1780
+ "step": 1430
1781
+ },
1782
+ {
1783
+ "epoch": 6.43,
1784
+ "learning_rate": 8.25159401645266e-05,
1785
+ "loss": 0.8094,
1786
+ "step": 1435
1787
+ },
1788
+ {
1789
+ "epoch": 6.46,
1790
+ "learning_rate": 7.775367444229211e-05,
1791
+ "loss": 0.8367,
1792
+ "step": 1440
1793
+ },
1794
+ {
1795
+ "epoch": 6.48,
1796
+ "learning_rate": 7.29460097591153e-05,
1797
+ "loss": 0.5786,
1798
+ "step": 1445
1799
+ },
1800
+ {
1801
+ "epoch": 6.5,
1802
+ "learning_rate": 6.81167904173639e-05,
1803
+ "loss": 0.8389,
1804
+ "step": 1450
1805
+ },
1806
+ {
1807
+ "epoch": 6.52,
1808
+ "learning_rate": 6.328996762282478e-05,
1809
+ "loss": 0.9198,
1810
+ "step": 1455
1811
+ },
1812
+ {
1813
+ "epoch": 6.55,
1814
+ "learning_rate": 5.848948069526602e-05,
1815
+ "loss": 0.8697,
1816
+ "step": 1460
1817
+ },
1818
+ {
1819
+ "epoch": 6.57,
1820
+ "learning_rate": 5.3739138337947083e-05,
1821
+ "loss": 0.805,
1822
+ "step": 1465
1823
+ },
1824
+ {
1825
+ "epoch": 6.59,
1826
+ "learning_rate": 4.90625005549404e-05,
1827
+ "loss": 0.7912,
1828
+ "step": 1470
1829
+ },
1830
+ {
1831
+ "epoch": 6.61,
1832
+ "learning_rate": 4.448276180191432e-05,
1833
+ "loss": 0.7832,
1834
+ "step": 1475
1835
+ },
1836
+ {
1837
+ "epoch": 6.64,
1838
+ "learning_rate": 4.0022635949903595e-05,
1839
+ "loss": 0.7555,
1840
+ "step": 1480
1841
+ },
1842
+ {
1843
+ "epoch": 6.66,
1844
+ "learning_rate": 3.570424363260848e-05,
1845
+ "loss": 0.88,
1846
+ "step": 1485
1847
+ },
1848
+ {
1849
+ "epoch": 6.68,
1850
+ "learning_rate": 3.1549002535941244e-05,
1851
+ "loss": 0.7924,
1852
+ "step": 1490
1853
+ },
1854
+ {
1855
+ "epoch": 6.7,
1856
+ "learning_rate": 2.75775211739412e-05,
1857
+ "loss": 0.7299,
1858
+ "step": 1495
1859
+ },
1860
+ {
1861
+ "epoch": 6.73,
1862
+ "learning_rate": 2.3809496677893663e-05,
1863
+ "loss": 0.9469,
1864
+ "step": 1500
1865
+ },
1866
+ {
1867
+ "epoch": 6.75,
1868
+ "learning_rate": 2.0263617105584168e-05,
1869
+ "loss": 0.6995,
1870
+ "step": 1505
1871
+ },
1872
+ {
1873
+ "epoch": 6.77,
1874
+ "learning_rate": 1.6957468755195542e-05,
1875
+ "loss": 0.8889,
1876
+ "step": 1510
1877
+ },
1878
+ {
1879
+ "epoch": 6.79,
1880
+ "learning_rate": 1.3907448943539954e-05,
1881
+ "loss": 0.8217,
1882
+ "step": 1515
1883
+ },
1884
+ {
1885
+ "epoch": 6.82,
1886
+ "learning_rate": 1.1128684681215955e-05,
1887
+ "loss": 0.7166,
1888
+ "step": 1520
1889
+ },
1890
+ {
1891
+ "epoch": 6.84,
1892
+ "learning_rate": 8.63495764802965e-06,
1893
+ "loss": 0.873,
1894
+ "step": 1525
1895
+ },
1896
+ {
1897
+ "epoch": 6.86,
1898
+ "learning_rate": 6.4386358407778465e-06,
1899
+ "loss": 0.906,
1900
+ "step": 1530
1901
+ },
1902
+ {
1903
+ "epoch": 6.88,
1904
+ "learning_rate": 4.550612232394066e-06,
1905
+ "loss": 0.9969,
1906
+ "step": 1535
1907
+ },
1908
+ {
1909
+ "epoch": 6.91,
1910
+ "learning_rate": 2.9802507466879855e-06,
1911
+ "loss": 0.6348,
1912
+ "step": 1540
1913
+ },
1914
+ {
1915
+ "epoch": 6.93,
1916
+ "learning_rate": 1.735339816622208e-06,
1917
+ "loss": 0.7365,
1918
+ "step": 1545
1919
+ },
1920
+ {
1921
+ "epoch": 6.95,
1922
+ "learning_rate": 8.220537564620966e-07,
1923
+ "loss": 0.9509,
1924
+ "step": 1550
1925
+ },
1926
+ {
1927
+ "epoch": 6.97,
1928
+ "learning_rate": 2.4492213937922245e-07,
1929
+ "loss": 0.8438,
1930
+ "step": 1555
1931
+ },
1932
+ {
1933
+ "epoch": 7.0,
1934
+ "learning_rate": 6.807332383425125e-09,
1935
+ "loss": 0.9895,
1936
+ "step": 1560
1937
+ },
1938
+ {
1939
+ "epoch": 7.0,
1940
+ "eval_loss": 1.1562458276748657,
1941
+ "eval_runtime": 2.2177,
1942
+ "eval_samples_per_second": 142.037,
1943
+ "eval_steps_per_second": 18.036,
1944
+ "step": 1561
1945
  }
1946
  ],
1947
+ "max_steps": 1561,
1948
+ "num_train_epochs": 7,
1949
+ "total_flos": 1630331633664000.0,
1950
  "trial_name": null,
1951
  "trial_params": null
1952
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eaf00d98777c40d1ee6ae43b9a54114156e44aec0020e9e8e22c522be0398be6
3
- size 2991
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a45c13f86e007d237bf70ffcf99d8136f61ccf6dd43efe67c5d795076fd2fde
3
+ size 3311