mohammadmahdinouri commited on
Commit
12fbc04
·
verified ·
1 Parent(s): 8aa6054

Training in progress, step 19000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b00afd84b6c9ce17eaf6cde875a1462d2a5f0a7c0b9c73a9b93dfa70356a2e2
3
  size 715030586
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24b1b3bb212bb8e7a89dfabaae6120a59b5faf8f122cf7ad539861771d8cb89a
3
  size 715030586
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ebce57f62b5c08e94d3ef4d4c19d6f624921ff13378d5f419a1a0fc63ae8de2
3
  size 1032262338
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:229e29c1490a5a19b0d4fabe6ed475185030744313f815cbb86ca74d5cd2c449
3
  size 1032262338
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13c1b31558f9530223d30967d940c908110b66ae87767dc8b41640c0ec2ab3ad
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dde6c449de2177e60a94a9900fe6d5a14850cbd574c0318aea1700129a48c14
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31e1f3d55bb567df3a2ebf344a0ee08608b18736ddff2de100218656482b16ab
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea9cf41a0d1b98e0760cde7b6c59ff69bea027f4012ae8adfed82ffa854f9831
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7068584adf4719cad732133ffdff00b498545ab4f7b6d887d675a74b59641e2
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:accf3b70270e61a071af09840966e9ef1fc65fa1b993b5947a7d43d8b578c1b2
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f43ad3e51655951e2a9c021cf9bdd46d25eb6df7a162e3fc18fe50a401173803
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5275cc5e18e1b6590412766faef669b7b593c775c1ba9bd63e9afe6463f5d8b8
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:add33ce1c647f1ad24436fdd2c7095ade5081fad618777000690c7e187278b49
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b73090e5ff4d77e40aae33305c58d2deda13e4f4510f1c076acf40a9f8a97bef
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.035099521769015894,
6
  "eval_steps": 500,
7
- "global_step": 18000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12608,6 +12608,706 @@
12608
  "learning_rate": 0.0004943110480558603,
12609
  "loss": 17.5819,
12610
  "step": 18000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12611
  }
12612
  ],
12613
  "logging_steps": 10,
@@ -12627,7 +13327,7 @@
12627
  "attributes": {}
12628
  }
12629
  },
12630
- "total_flos": 3.889326067389196e+19,
12631
  "train_batch_size": 48,
12632
  "trial_name": null,
12633
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.03704949520062789,
6
  "eval_steps": 500,
7
+ "global_step": 19000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12608
  "learning_rate": 0.0004943110480558603,
12609
  "loss": 17.5819,
12610
  "step": 18000
12611
+ },
12612
+ {
12613
+ "epoch": 0.035119021503332015,
12614
+ "grad_norm": 9.4375,
12615
+ "learning_rate": 0.0004943077970404056,
12616
+ "loss": 17.6622,
12617
+ "step": 18010
12618
+ },
12619
+ {
12620
+ "epoch": 0.035138521237648136,
12621
+ "grad_norm": 8.25,
12622
+ "learning_rate": 0.0004943045460249509,
12623
+ "loss": 17.7021,
12624
+ "step": 18020
12625
+ },
12626
+ {
12627
+ "epoch": 0.03515802097196426,
12628
+ "grad_norm": 7.71875,
12629
+ "learning_rate": 0.0004943012950094963,
12630
+ "loss": 17.5496,
12631
+ "step": 18030
12632
+ },
12633
+ {
12634
+ "epoch": 0.03517752070628038,
12635
+ "grad_norm": 10.375,
12636
+ "learning_rate": 0.0004942980439940416,
12637
+ "loss": 17.6953,
12638
+ "step": 18040
12639
+ },
12640
+ {
12641
+ "epoch": 0.0351970204405965,
12642
+ "grad_norm": 8.625,
12643
+ "learning_rate": 0.0004942947929785869,
12644
+ "loss": 17.5717,
12645
+ "step": 18050
12646
+ },
12647
+ {
12648
+ "epoch": 0.03521652017491262,
12649
+ "grad_norm": 8.25,
12650
+ "learning_rate": 0.0004942915419631321,
12651
+ "loss": 17.5987,
12652
+ "step": 18060
12653
+ },
12654
+ {
12655
+ "epoch": 0.035236019909228734,
12656
+ "grad_norm": 8.0625,
12657
+ "learning_rate": 0.0004942882909476775,
12658
+ "loss": 17.4626,
12659
+ "step": 18070
12660
+ },
12661
+ {
12662
+ "epoch": 0.035255519643544855,
12663
+ "grad_norm": 8.6875,
12664
+ "learning_rate": 0.0004942850399322228,
12665
+ "loss": 17.6371,
12666
+ "step": 18080
12667
+ },
12668
+ {
12669
+ "epoch": 0.035275019377860976,
12670
+ "grad_norm": 8.8125,
12671
+ "learning_rate": 0.0004942817889167681,
12672
+ "loss": 17.6286,
12673
+ "step": 18090
12674
+ },
12675
+ {
12676
+ "epoch": 0.0352945191121771,
12677
+ "grad_norm": 7.375,
12678
+ "learning_rate": 0.0004942785379013135,
12679
+ "loss": 17.4806,
12680
+ "step": 18100
12681
+ },
12682
+ {
12683
+ "epoch": 0.03531401884649322,
12684
+ "grad_norm": 8.8125,
12685
+ "learning_rate": 0.0004942752868858588,
12686
+ "loss": 17.5438,
12687
+ "step": 18110
12688
+ },
12689
+ {
12690
+ "epoch": 0.03533351858080934,
12691
+ "grad_norm": 8.75,
12692
+ "learning_rate": 0.0004942720358704041,
12693
+ "loss": 17.6634,
12694
+ "step": 18120
12695
+ },
12696
+ {
12697
+ "epoch": 0.03535301831512545,
12698
+ "grad_norm": 8.1875,
12699
+ "learning_rate": 0.0004942687848549494,
12700
+ "loss": 17.5197,
12701
+ "step": 18130
12702
+ },
12703
+ {
12704
+ "epoch": 0.035372518049441574,
12705
+ "grad_norm": 8.25,
12706
+ "learning_rate": 0.0004942655338394948,
12707
+ "loss": 17.6001,
12708
+ "step": 18140
12709
+ },
12710
+ {
12711
+ "epoch": 0.035392017783757695,
12712
+ "grad_norm": 8.4375,
12713
+ "learning_rate": 0.0004942622828240401,
12714
+ "loss": 17.6117,
12715
+ "step": 18150
12716
+ },
12717
+ {
12718
+ "epoch": 0.035411517518073816,
12719
+ "grad_norm": 8.0625,
12720
+ "learning_rate": 0.0004942590318085854,
12721
+ "loss": 17.4802,
12722
+ "step": 18160
12723
+ },
12724
+ {
12725
+ "epoch": 0.03543101725238994,
12726
+ "grad_norm": 8.375,
12727
+ "learning_rate": 0.0004942557807931308,
12728
+ "loss": 17.6808,
12729
+ "step": 18170
12730
+ },
12731
+ {
12732
+ "epoch": 0.03545051698670606,
12733
+ "grad_norm": 8.5,
12734
+ "learning_rate": 0.0004942525297776761,
12735
+ "loss": 17.5905,
12736
+ "step": 18180
12737
+ },
12738
+ {
12739
+ "epoch": 0.03547001672102218,
12740
+ "grad_norm": 11.3125,
12741
+ "learning_rate": 0.0004942492787622214,
12742
+ "loss": 17.5809,
12743
+ "step": 18190
12744
+ },
12745
+ {
12746
+ "epoch": 0.035489516455338294,
12747
+ "grad_norm": 8.4375,
12748
+ "learning_rate": 0.0004942460277467667,
12749
+ "loss": 17.7091,
12750
+ "step": 18200
12751
+ },
12752
+ {
12753
+ "epoch": 0.035509016189654415,
12754
+ "grad_norm": 7.9375,
12755
+ "learning_rate": 0.0004942427767313121,
12756
+ "loss": 17.5444,
12757
+ "step": 18210
12758
+ },
12759
+ {
12760
+ "epoch": 0.035528515923970536,
12761
+ "grad_norm": 8.375,
12762
+ "learning_rate": 0.0004942395257158573,
12763
+ "loss": 17.5533,
12764
+ "step": 18220
12765
+ },
12766
+ {
12767
+ "epoch": 0.03554801565828666,
12768
+ "grad_norm": 7.75,
12769
+ "learning_rate": 0.0004942362747004026,
12770
+ "loss": 17.6227,
12771
+ "step": 18230
12772
+ },
12773
+ {
12774
+ "epoch": 0.03556751539260278,
12775
+ "grad_norm": 7.6875,
12776
+ "learning_rate": 0.000494233023684948,
12777
+ "loss": 17.5303,
12778
+ "step": 18240
12779
+ },
12780
+ {
12781
+ "epoch": 0.0355870151269189,
12782
+ "grad_norm": 7.84375,
12783
+ "learning_rate": 0.0004942297726694933,
12784
+ "loss": 17.5429,
12785
+ "step": 18250
12786
+ },
12787
+ {
12788
+ "epoch": 0.03560651486123501,
12789
+ "grad_norm": 9.8125,
12790
+ "learning_rate": 0.0004942265216540386,
12791
+ "loss": 17.6151,
12792
+ "step": 18260
12793
+ },
12794
+ {
12795
+ "epoch": 0.035626014595551134,
12796
+ "grad_norm": 10.5,
12797
+ "learning_rate": 0.0004942232706385839,
12798
+ "loss": 17.42,
12799
+ "step": 18270
12800
+ },
12801
+ {
12802
+ "epoch": 0.035645514329867255,
12803
+ "grad_norm": 9.1875,
12804
+ "learning_rate": 0.0004942200196231293,
12805
+ "loss": 17.436,
12806
+ "step": 18280
12807
+ },
12808
+ {
12809
+ "epoch": 0.035665014064183376,
12810
+ "grad_norm": 8.8125,
12811
+ "learning_rate": 0.0004942167686076746,
12812
+ "loss": 17.5189,
12813
+ "step": 18290
12814
+ },
12815
+ {
12816
+ "epoch": 0.0356845137984995,
12817
+ "grad_norm": 8.6875,
12818
+ "learning_rate": 0.0004942135175922199,
12819
+ "loss": 17.5484,
12820
+ "step": 18300
12821
+ },
12822
+ {
12823
+ "epoch": 0.03570401353281562,
12824
+ "grad_norm": 9.0,
12825
+ "learning_rate": 0.0004942102665767652,
12826
+ "loss": 17.6905,
12827
+ "step": 18310
12828
+ },
12829
+ {
12830
+ "epoch": 0.03572351326713174,
12831
+ "grad_norm": 8.0625,
12832
+ "learning_rate": 0.0004942070155613106,
12833
+ "loss": 17.677,
12834
+ "step": 18320
12835
+ },
12836
+ {
12837
+ "epoch": 0.03574301300144785,
12838
+ "grad_norm": 7.4375,
12839
+ "learning_rate": 0.0004942037645458559,
12840
+ "loss": 17.5109,
12841
+ "step": 18330
12842
+ },
12843
+ {
12844
+ "epoch": 0.035762512735763974,
12845
+ "grad_norm": 8.25,
12846
+ "learning_rate": 0.0004942005135304012,
12847
+ "loss": 17.52,
12848
+ "step": 18340
12849
+ },
12850
+ {
12851
+ "epoch": 0.035782012470080095,
12852
+ "grad_norm": 8.9375,
12853
+ "learning_rate": 0.0004941972625149466,
12854
+ "loss": 17.6037,
12855
+ "step": 18350
12856
+ },
12857
+ {
12858
+ "epoch": 0.035801512204396216,
12859
+ "grad_norm": 8.25,
12860
+ "learning_rate": 0.0004941940114994919,
12861
+ "loss": 17.5273,
12862
+ "step": 18360
12863
+ },
12864
+ {
12865
+ "epoch": 0.03582101193871234,
12866
+ "grad_norm": 8.25,
12867
+ "learning_rate": 0.0004941907604840372,
12868
+ "loss": 17.4488,
12869
+ "step": 18370
12870
+ },
12871
+ {
12872
+ "epoch": 0.03584051167302846,
12873
+ "grad_norm": 8.5625,
12874
+ "learning_rate": 0.0004941875094685825,
12875
+ "loss": 17.5285,
12876
+ "step": 18380
12877
+ },
12878
+ {
12879
+ "epoch": 0.03586001140734457,
12880
+ "grad_norm": 10.125,
12881
+ "learning_rate": 0.0004941842584531279,
12882
+ "loss": 17.5049,
12883
+ "step": 18390
12884
+ },
12885
+ {
12886
+ "epoch": 0.03587951114166069,
12887
+ "grad_norm": 8.625,
12888
+ "learning_rate": 0.0004941810074376732,
12889
+ "loss": 17.5321,
12890
+ "step": 18400
12891
+ },
12892
+ {
12893
+ "epoch": 0.035899010875976814,
12894
+ "grad_norm": 10.5,
12895
+ "learning_rate": 0.0004941777564222185,
12896
+ "loss": 17.5864,
12897
+ "step": 18410
12898
+ },
12899
+ {
12900
+ "epoch": 0.035918510610292935,
12901
+ "grad_norm": 8.25,
12902
+ "learning_rate": 0.0004941745054067638,
12903
+ "loss": 17.4779,
12904
+ "step": 18420
12905
+ },
12906
+ {
12907
+ "epoch": 0.035938010344609056,
12908
+ "grad_norm": 7.9375,
12909
+ "learning_rate": 0.0004941712543913091,
12910
+ "loss": 17.4962,
12911
+ "step": 18430
12912
+ },
12913
+ {
12914
+ "epoch": 0.03595751007892518,
12915
+ "grad_norm": 8.1875,
12916
+ "learning_rate": 0.0004941680033758544,
12917
+ "loss": 17.449,
12918
+ "step": 18440
12919
+ },
12920
+ {
12921
+ "epoch": 0.0359770098132413,
12922
+ "grad_norm": 7.84375,
12923
+ "learning_rate": 0.0004941647523603997,
12924
+ "loss": 17.5084,
12925
+ "step": 18450
12926
+ },
12927
+ {
12928
+ "epoch": 0.03599650954755741,
12929
+ "grad_norm": 7.78125,
12930
+ "learning_rate": 0.0004941615013449451,
12931
+ "loss": 17.5459,
12932
+ "step": 18460
12933
+ },
12934
+ {
12935
+ "epoch": 0.03601600928187353,
12936
+ "grad_norm": 7.875,
12937
+ "learning_rate": 0.0004941582503294904,
12938
+ "loss": 17.5541,
12939
+ "step": 18470
12940
+ },
12941
+ {
12942
+ "epoch": 0.036035509016189654,
12943
+ "grad_norm": 9.125,
12944
+ "learning_rate": 0.0004941549993140357,
12945
+ "loss": 17.635,
12946
+ "step": 18480
12947
+ },
12948
+ {
12949
+ "epoch": 0.036055008750505775,
12950
+ "grad_norm": 7.34375,
12951
+ "learning_rate": 0.000494151748298581,
12952
+ "loss": 17.5686,
12953
+ "step": 18490
12954
+ },
12955
+ {
12956
+ "epoch": 0.036074508484821896,
12957
+ "grad_norm": 8.75,
12958
+ "learning_rate": 0.0004941484972831264,
12959
+ "loss": 17.6741,
12960
+ "step": 18500
12961
+ },
12962
+ {
12963
+ "epoch": 0.03609400821913802,
12964
+ "grad_norm": 7.84375,
12965
+ "learning_rate": 0.0004941452462676717,
12966
+ "loss": 17.6735,
12967
+ "step": 18510
12968
+ },
12969
+ {
12970
+ "epoch": 0.03611350795345413,
12971
+ "grad_norm": 9.0625,
12972
+ "learning_rate": 0.000494141995252217,
12973
+ "loss": 17.5192,
12974
+ "step": 18520
12975
+ },
12976
+ {
12977
+ "epoch": 0.03613300768777025,
12978
+ "grad_norm": 8.625,
12979
+ "learning_rate": 0.0004941387442367624,
12980
+ "loss": 17.3641,
12981
+ "step": 18530
12982
+ },
12983
+ {
12984
+ "epoch": 0.03615250742208637,
12985
+ "grad_norm": 7.78125,
12986
+ "learning_rate": 0.0004941354932213077,
12987
+ "loss": 17.586,
12988
+ "step": 18540
12989
+ },
12990
+ {
12991
+ "epoch": 0.036172007156402494,
12992
+ "grad_norm": 8.875,
12993
+ "learning_rate": 0.000494132242205853,
12994
+ "loss": 17.5065,
12995
+ "step": 18550
12996
+ },
12997
+ {
12998
+ "epoch": 0.036191506890718615,
12999
+ "grad_norm": 9.3125,
13000
+ "learning_rate": 0.0004941289911903983,
13001
+ "loss": 17.4222,
13002
+ "step": 18560
13003
+ },
13004
+ {
13005
+ "epoch": 0.036211006625034736,
13006
+ "grad_norm": 8.25,
13007
+ "learning_rate": 0.0004941257401749437,
13008
+ "loss": 17.513,
13009
+ "step": 18570
13010
+ },
13011
+ {
13012
+ "epoch": 0.03623050635935086,
13013
+ "grad_norm": 8.5625,
13014
+ "learning_rate": 0.000494122489159489,
13015
+ "loss": 17.5073,
13016
+ "step": 18580
13017
+ },
13018
+ {
13019
+ "epoch": 0.03625000609366697,
13020
+ "grad_norm": 11.875,
13021
+ "learning_rate": 0.0004941192381440343,
13022
+ "loss": 17.3283,
13023
+ "step": 18590
13024
+ },
13025
+ {
13026
+ "epoch": 0.03626950582798309,
13027
+ "grad_norm": 7.90625,
13028
+ "learning_rate": 0.0004941159871285797,
13029
+ "loss": 17.4716,
13030
+ "step": 18600
13031
+ },
13032
+ {
13033
+ "epoch": 0.03628900556229921,
13034
+ "grad_norm": 8.5,
13035
+ "learning_rate": 0.000494112736113125,
13036
+ "loss": 17.372,
13037
+ "step": 18610
13038
+ },
13039
+ {
13040
+ "epoch": 0.036308505296615334,
13041
+ "grad_norm": 8.75,
13042
+ "learning_rate": 0.0004941094850976703,
13043
+ "loss": 17.4356,
13044
+ "step": 18620
13045
+ },
13046
+ {
13047
+ "epoch": 0.036328005030931455,
13048
+ "grad_norm": 8.0,
13049
+ "learning_rate": 0.0004941062340822156,
13050
+ "loss": 17.5272,
13051
+ "step": 18630
13052
+ },
13053
+ {
13054
+ "epoch": 0.036347504765247576,
13055
+ "grad_norm": 7.65625,
13056
+ "learning_rate": 0.000494102983066761,
13057
+ "loss": 17.5673,
13058
+ "step": 18640
13059
+ },
13060
+ {
13061
+ "epoch": 0.03636700449956369,
13062
+ "grad_norm": 7.71875,
13063
+ "learning_rate": 0.0004940997320513063,
13064
+ "loss": 17.5017,
13065
+ "step": 18650
13066
+ },
13067
+ {
13068
+ "epoch": 0.03638650423387981,
13069
+ "grad_norm": 7.65625,
13070
+ "learning_rate": 0.0004940964810358516,
13071
+ "loss": 17.4877,
13072
+ "step": 18660
13073
+ },
13074
+ {
13075
+ "epoch": 0.03640600396819593,
13076
+ "grad_norm": 8.4375,
13077
+ "learning_rate": 0.0004940932300203969,
13078
+ "loss": 17.4823,
13079
+ "step": 18670
13080
+ },
13081
+ {
13082
+ "epoch": 0.03642550370251205,
13083
+ "grad_norm": 8.25,
13084
+ "learning_rate": 0.0004940899790049422,
13085
+ "loss": 17.4745,
13086
+ "step": 18680
13087
+ },
13088
+ {
13089
+ "epoch": 0.036445003436828174,
13090
+ "grad_norm": 9.0,
13091
+ "learning_rate": 0.0004940867279894875,
13092
+ "loss": 17.6121,
13093
+ "step": 18690
13094
+ },
13095
+ {
13096
+ "epoch": 0.036464503171144295,
13097
+ "grad_norm": 9.1875,
13098
+ "learning_rate": 0.0004940834769740328,
13099
+ "loss": 17.5465,
13100
+ "step": 18700
13101
+ },
13102
+ {
13103
+ "epoch": 0.036484002905460416,
13104
+ "grad_norm": 10.3125,
13105
+ "learning_rate": 0.0004940802259585782,
13106
+ "loss": 17.5631,
13107
+ "step": 18710
13108
+ },
13109
+ {
13110
+ "epoch": 0.03650350263977653,
13111
+ "grad_norm": 8.0625,
13112
+ "learning_rate": 0.0004940769749431235,
13113
+ "loss": 17.4597,
13114
+ "step": 18720
13115
+ },
13116
+ {
13117
+ "epoch": 0.03652300237409265,
13118
+ "grad_norm": 13.4375,
13119
+ "learning_rate": 0.0004940737239276688,
13120
+ "loss": 17.4941,
13121
+ "step": 18730
13122
+ },
13123
+ {
13124
+ "epoch": 0.03654250210840877,
13125
+ "grad_norm": 7.4375,
13126
+ "learning_rate": 0.0004940704729122142,
13127
+ "loss": 17.5893,
13128
+ "step": 18740
13129
+ },
13130
+ {
13131
+ "epoch": 0.036562001842724894,
13132
+ "grad_norm": 9.0625,
13133
+ "learning_rate": 0.0004940672218967595,
13134
+ "loss": 17.5582,
13135
+ "step": 18750
13136
+ },
13137
+ {
13138
+ "epoch": 0.036581501577041015,
13139
+ "grad_norm": 8.8125,
13140
+ "learning_rate": 0.0004940639708813048,
13141
+ "loss": 17.51,
13142
+ "step": 18760
13143
+ },
13144
+ {
13145
+ "epoch": 0.036601001311357136,
13146
+ "grad_norm": 8.1875,
13147
+ "learning_rate": 0.0004940607198658501,
13148
+ "loss": 17.4947,
13149
+ "step": 18770
13150
+ },
13151
+ {
13152
+ "epoch": 0.03662050104567325,
13153
+ "grad_norm": 8.0,
13154
+ "learning_rate": 0.0004940574688503955,
13155
+ "loss": 17.3291,
13156
+ "step": 18780
13157
+ },
13158
+ {
13159
+ "epoch": 0.03664000077998937,
13160
+ "grad_norm": 8.0625,
13161
+ "learning_rate": 0.0004940542178349408,
13162
+ "loss": 17.4147,
13163
+ "step": 18790
13164
+ },
13165
+ {
13166
+ "epoch": 0.03665950051430549,
13167
+ "grad_norm": 7.3125,
13168
+ "learning_rate": 0.0004940509668194861,
13169
+ "loss": 17.4158,
13170
+ "step": 18800
13171
+ },
13172
+ {
13173
+ "epoch": 0.03667900024862161,
13174
+ "grad_norm": 8.5625,
13175
+ "learning_rate": 0.0004940477158040315,
13176
+ "loss": 17.3498,
13177
+ "step": 18810
13178
+ },
13179
+ {
13180
+ "epoch": 0.036698499982937734,
13181
+ "grad_norm": 8.375,
13182
+ "learning_rate": 0.0004940444647885768,
13183
+ "loss": 17.5235,
13184
+ "step": 18820
13185
+ },
13186
+ {
13187
+ "epoch": 0.036717999717253855,
13188
+ "grad_norm": 8.125,
13189
+ "learning_rate": 0.000494041213773122,
13190
+ "loss": 17.5074,
13191
+ "step": 18830
13192
+ },
13193
+ {
13194
+ "epoch": 0.036737499451569976,
13195
+ "grad_norm": 8.4375,
13196
+ "learning_rate": 0.0004940379627576673,
13197
+ "loss": 17.5261,
13198
+ "step": 18840
13199
+ },
13200
+ {
13201
+ "epoch": 0.03675699918588609,
13202
+ "grad_norm": 26.625,
13203
+ "learning_rate": 0.0004940347117422127,
13204
+ "loss": 17.4652,
13205
+ "step": 18850
13206
+ },
13207
+ {
13208
+ "epoch": 0.03677649892020221,
13209
+ "grad_norm": 8.75,
13210
+ "learning_rate": 0.000494031460726758,
13211
+ "loss": 17.5258,
13212
+ "step": 18860
13213
+ },
13214
+ {
13215
+ "epoch": 0.03679599865451833,
13216
+ "grad_norm": 9.125,
13217
+ "learning_rate": 0.0004940282097113033,
13218
+ "loss": 17.487,
13219
+ "step": 18870
13220
+ },
13221
+ {
13222
+ "epoch": 0.03681549838883445,
13223
+ "grad_norm": 9.4375,
13224
+ "learning_rate": 0.0004940249586958486,
13225
+ "loss": 17.4624,
13226
+ "step": 18880
13227
+ },
13228
+ {
13229
+ "epoch": 0.036834998123150574,
13230
+ "grad_norm": 7.78125,
13231
+ "learning_rate": 0.000494021707680394,
13232
+ "loss": 17.4346,
13233
+ "step": 18890
13234
+ },
13235
+ {
13236
+ "epoch": 0.036854497857466695,
13237
+ "grad_norm": 9.1875,
13238
+ "learning_rate": 0.0004940184566649393,
13239
+ "loss": 17.4845,
13240
+ "step": 18900
13241
+ },
13242
+ {
13243
+ "epoch": 0.03687399759178281,
13244
+ "grad_norm": 8.25,
13245
+ "learning_rate": 0.0004940152056494846,
13246
+ "loss": 17.4851,
13247
+ "step": 18910
13248
+ },
13249
+ {
13250
+ "epoch": 0.03689349732609893,
13251
+ "grad_norm": 8.8125,
13252
+ "learning_rate": 0.00049401195463403,
13253
+ "loss": 17.423,
13254
+ "step": 18920
13255
+ },
13256
+ {
13257
+ "epoch": 0.03691299706041505,
13258
+ "grad_norm": 7.9375,
13259
+ "learning_rate": 0.0004940087036185753,
13260
+ "loss": 17.5391,
13261
+ "step": 18930
13262
+ },
13263
+ {
13264
+ "epoch": 0.03693249679473117,
13265
+ "grad_norm": 8.0625,
13266
+ "learning_rate": 0.0004940054526031206,
13267
+ "loss": 17.3817,
13268
+ "step": 18940
13269
+ },
13270
+ {
13271
+ "epoch": 0.03695199652904729,
13272
+ "grad_norm": 7.90625,
13273
+ "learning_rate": 0.0004940022015876659,
13274
+ "loss": 17.3288,
13275
+ "step": 18950
13276
+ },
13277
+ {
13278
+ "epoch": 0.036971496263363414,
13279
+ "grad_norm": 7.625,
13280
+ "learning_rate": 0.0004939989505722113,
13281
+ "loss": 17.4329,
13282
+ "step": 18960
13283
+ },
13284
+ {
13285
+ "epoch": 0.036990995997679535,
13286
+ "grad_norm": 7.625,
13287
+ "learning_rate": 0.0004939956995567566,
13288
+ "loss": 17.4302,
13289
+ "step": 18970
13290
+ },
13291
+ {
13292
+ "epoch": 0.03701049573199565,
13293
+ "grad_norm": 7.34375,
13294
+ "learning_rate": 0.0004939924485413019,
13295
+ "loss": 17.3948,
13296
+ "step": 18980
13297
+ },
13298
+ {
13299
+ "epoch": 0.03702999546631177,
13300
+ "grad_norm": 8.4375,
13301
+ "learning_rate": 0.0004939891975258471,
13302
+ "loss": 17.4144,
13303
+ "step": 18990
13304
+ },
13305
+ {
13306
+ "epoch": 0.03704949520062789,
13307
+ "grad_norm": 8.1875,
13308
+ "learning_rate": 0.0004939859465103925,
13309
+ "loss": 17.4472,
13310
+ "step": 19000
13311
  }
13312
  ],
13313
  "logging_steps": 10,
 
13327
  "attributes": {}
13328
  }
13329
  },
13330
+ "total_flos": 4.105398039319098e+19,
13331
  "train_batch_size": 48,
13332
  "trial_name": null,
13333
  "trial_params": null