jssky committed
Commit 61176b4 · verified · 1 Parent(s): f6b8eed

Training in progress, step 360, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1c4b4e0c119e19824e7bd5407b4bbc3376338122ebee9b6aeec68c49592370df
+ oid sha256:9f87937b8ca27ccd4299b7c5130ae44956cd82f7f19b3390773e63ba13d89567
  size 83945296
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ddee69f74c4e0b1f4c805c6781d6e486fedf745854f942f9e17ad08f3a76da24
- size 43122580
+ oid sha256:f877183493e32a2691b5c7511ce4bd58a6fd3cee41d4a62b9ca7258094d06da7
+ size 43123028
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9541b46c7b71871f1a3b9e3df6f3775e4939fad20c1d964c0daf912320c7f532
+ oid sha256:ee84c698affd10554f0eb51f34115cac8713e7377d5275afd8629075df0dbc22
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8cc881deac2823d81b5585b733cf7bc610286ec0ee2764f20dd7976dbe33563b
+ oid sha256:e42796949e3f84fa5a7101ae5e25b3cdd5ef9daeacff6fbfb32969265fb052fd
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.5031446540880503,
+ "epoch": 0.7547169811320755,
  "eval_steps": 120,
- "global_step": 240,
+ "global_step": 360,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1703,6 +1703,854 @@
  "eval_samples_per_second": 14.403,
  "eval_steps_per_second": 7.237,
  "step": 240
+ },
+ {
+ "epoch": 0.5052410901467506,
+ "grad_norm": 0.4681568741798401,
+ "learning_rate": 0.00010168171550280648,
+ "loss": 1.4336,
+ "step": 241
+ },
+ {
+ "epoch": 0.5073375262054507,
+ "grad_norm": 0.4985293447971344,
+ "learning_rate": 0.00010100905974490651,
+ "loss": 1.2779,
+ "step": 242
+ },
+ {
+ "epoch": 0.5094339622641509,
+ "grad_norm": 0.5365903377532959,
+ "learning_rate": 0.0001003363583222415,
+ "loss": 1.5644,
+ "step": 243
+ },
+ {
+ "epoch": 0.5115303983228512,
+ "grad_norm": 0.48949047923088074,
+ "learning_rate": 9.96636416777585e-05,
+ "loss": 1.287,
+ "step": 244
+ },
+ {
+ "epoch": 0.5136268343815513,
+ "grad_norm": 0.5527725219726562,
+ "learning_rate": 9.899094025509352e-05,
+ "loss": 1.4812,
+ "step": 245
+ },
+ {
+ "epoch": 0.5157232704402516,
+ "grad_norm": 0.5897342562675476,
+ "learning_rate": 9.831828449719353e-05,
+ "loss": 1.4931,
+ "step": 246
+ },
+ {
+ "epoch": 0.5178197064989518,
+ "grad_norm": 0.5165984630584717,
+ "learning_rate": 9.764570484493915e-05,
+ "loss": 1.208,
+ "step": 247
+ },
+ {
+ "epoch": 0.519916142557652,
+ "grad_norm": 0.654744565486908,
+ "learning_rate": 9.697323173576667e-05,
+ "loss": 1.6835,
+ "step": 248
+ },
+ {
+ "epoch": 0.5220125786163522,
+ "grad_norm": 0.6669768691062927,
+ "learning_rate": 9.630089560229088e-05,
+ "loss": 1.1676,
+ "step": 249
+ },
+ {
+ "epoch": 0.5241090146750524,
+ "grad_norm": 1.1078684329986572,
+ "learning_rate": 9.562872687092783e-05,
+ "loss": 1.4822,
+ "step": 250
+ },
+ {
+ "epoch": 0.5262054507337526,
+ "grad_norm": 0.31687837839126587,
+ "learning_rate": 9.495675596051777e-05,
+ "loss": 1.1412,
+ "step": 251
+ },
+ {
+ "epoch": 0.5283018867924528,
+ "grad_norm": 0.32858502864837646,
+ "learning_rate": 9.428501328094855e-05,
+ "loss": 1.1039,
+ "step": 252
+ },
+ {
+ "epoch": 0.5303983228511531,
+ "grad_norm": 0.32667338848114014,
+ "learning_rate": 9.36135292317796e-05,
+ "loss": 1.2138,
+ "step": 253
+ },
+ {
+ "epoch": 0.5324947589098532,
+ "grad_norm": 0.3239830732345581,
+ "learning_rate": 9.294233420086603e-05,
+ "loss": 1.3107,
+ "step": 254
+ },
+ {
+ "epoch": 0.5345911949685535,
+ "grad_norm": 0.37542301416397095,
+ "learning_rate": 9.227145856298344e-05,
+ "loss": 1.1616,
+ "step": 255
+ },
+ {
+ "epoch": 0.5366876310272537,
+ "grad_norm": 0.356937050819397,
+ "learning_rate": 9.160093267845349e-05,
+ "loss": 1.3193,
+ "step": 256
+ },
+ {
+ "epoch": 0.5387840670859538,
+ "grad_norm": 0.31748053431510925,
+ "learning_rate": 9.093078689176972e-05,
+ "loss": 1.2206,
+ "step": 257
+ },
+ {
+ "epoch": 0.5408805031446541,
+ "grad_norm": 0.3468354642391205,
+ "learning_rate": 9.026105153022454e-05,
+ "loss": 1.3771,
+ "step": 258
+ },
+ {
+ "epoch": 0.5429769392033543,
+ "grad_norm": 0.35934552550315857,
+ "learning_rate": 8.95917569025366e-05,
+ "loss": 1.2956,
+ "step": 259
+ },
+ {
+ "epoch": 0.5450733752620545,
+ "grad_norm": 0.35253018140792847,
+ "learning_rate": 8.892293329747922e-05,
+ "loss": 1.2584,
+ "step": 260
+ },
+ {
+ "epoch": 0.5471698113207547,
+ "grad_norm": 0.3898825943470001,
+ "learning_rate": 8.82546109825098e-05,
+ "loss": 1.2658,
+ "step": 261
+ },
+ {
+ "epoch": 0.549266247379455,
+ "grad_norm": 0.44399771094322205,
+ "learning_rate": 8.758682020239984e-05,
+ "loss": 1.314,
+ "step": 262
+ },
+ {
+ "epoch": 0.5513626834381551,
+ "grad_norm": 0.3997746706008911,
+ "learning_rate": 8.69195911778664e-05,
+ "loss": 1.1614,
+ "step": 263
+ },
+ {
+ "epoch": 0.5534591194968553,
+ "grad_norm": 0.38755127787590027,
+ "learning_rate": 8.625295410420451e-05,
+ "loss": 1.209,
+ "step": 264
+ },
+ {
+ "epoch": 0.5555555555555556,
+ "grad_norm": 0.3919242024421692,
+ "learning_rate": 8.558693914992046e-05,
+ "loss": 1.4929,
+ "step": 265
+ },
+ {
+ "epoch": 0.5576519916142557,
+ "grad_norm": 0.37072527408599854,
+ "learning_rate": 8.492157645536678e-05,
+ "loss": 1.0455,
+ "step": 266
+ },
+ {
+ "epoch": 0.559748427672956,
+ "grad_norm": 0.39799362421035767,
+ "learning_rate": 8.425689613137813e-05,
+ "loss": 1.3028,
+ "step": 267
+ },
+ {
+ "epoch": 0.5618448637316562,
+ "grad_norm": 0.34042948484420776,
+ "learning_rate": 8.359292825790859e-05,
+ "loss": 1.2346,
+ "step": 268
+ },
+ {
+ "epoch": 0.5639412997903563,
+ "grad_norm": 0.41574835777282715,
+ "learning_rate": 8.292970288267042e-05,
+ "loss": 1.6697,
+ "step": 269
+ },
+ {
+ "epoch": 0.5660377358490566,
+ "grad_norm": 0.39489176869392395,
+ "learning_rate": 8.226725001977445e-05,
+ "loss": 1.4031,
+ "step": 270
+ },
+ {
+ "epoch": 0.5681341719077568,
+ "grad_norm": 0.37004798650741577,
+ "learning_rate": 8.160559964837149e-05,
+ "loss": 1.0657,
+ "step": 271
+ },
+ {
+ "epoch": 0.570230607966457,
+ "grad_norm": 0.4158036410808563,
+ "learning_rate": 8.094478171129588e-05,
+ "loss": 1.4288,
+ "step": 272
+ },
+ {
+ "epoch": 0.5723270440251572,
+ "grad_norm": 0.39994949102401733,
+ "learning_rate": 8.028482611371028e-05,
+ "loss": 1.4105,
+ "step": 273
+ },
+ {
+ "epoch": 0.5744234800838575,
+ "grad_norm": 0.3869493007659912,
+ "learning_rate": 7.96257627217524e-05,
+ "loss": 1.452,
+ "step": 274
+ },
+ {
+ "epoch": 0.5765199161425576,
+ "grad_norm": 0.41220688819885254,
+ "learning_rate": 7.896762136118342e-05,
+ "loss": 1.6473,
+ "step": 275
+ },
+ {
+ "epoch": 0.5786163522012578,
+ "grad_norm": 0.4000371992588043,
+ "learning_rate": 7.831043181603814e-05,
+ "loss": 1.2974,
+ "step": 276
+ },
+ {
+ "epoch": 0.5807127882599581,
+ "grad_norm": 0.43282032012939453,
+ "learning_rate": 7.765422382727719e-05,
+ "loss": 1.4971,
+ "step": 277
+ },
+ {
+ "epoch": 0.5828092243186582,
+ "grad_norm": 0.37778139114379883,
+ "learning_rate": 7.699902709144114e-05,
+ "loss": 1.2826,
+ "step": 278
+ },
+ {
+ "epoch": 0.5849056603773585,
+ "grad_norm": 0.3722570538520813,
+ "learning_rate": 7.634487125930648e-05,
+ "loss": 1.0699,
+ "step": 279
+ },
+ {
+ "epoch": 0.5870020964360587,
+ "grad_norm": 0.4069075882434845,
+ "learning_rate": 7.569178593454392e-05,
+ "loss": 1.2477,
+ "step": 280
+ },
+ {
+ "epoch": 0.589098532494759,
+ "grad_norm": 0.4309166669845581,
+ "learning_rate": 7.503980067237852e-05,
+ "loss": 1.2098,
+ "step": 281
+ },
+ {
+ "epoch": 0.5911949685534591,
+ "grad_norm": 0.3983183205127716,
+ "learning_rate": 7.438894497825235e-05,
+ "loss": 1.2926,
+ "step": 282
+ },
+ {
+ "epoch": 0.5932914046121593,
+ "grad_norm": 0.4100129008293152,
+ "learning_rate": 7.373924830648904e-05,
+ "loss": 1.2444,
+ "step": 283
+ },
+ {
+ "epoch": 0.5953878406708596,
+ "grad_norm": 0.45969000458717346,
+ "learning_rate": 7.309074005896103e-05,
+ "loss": 1.5165,
+ "step": 284
+ },
+ {
+ "epoch": 0.5974842767295597,
+ "grad_norm": 0.42328140139579773,
+ "learning_rate": 7.244344958375881e-05,
+ "loss": 1.2259,
+ "step": 285
+ },
+ {
+ "epoch": 0.59958071278826,
+ "grad_norm": 0.4778403639793396,
+ "learning_rate": 7.179740617386295e-05,
+ "loss": 1.3914,
+ "step": 286
+ },
+ {
+ "epoch": 0.6016771488469602,
+ "grad_norm": 0.5297214388847351,
+ "learning_rate": 7.115263906581829e-05,
+ "loss": 1.6115,
+ "step": 287
+ },
+ {
+ "epoch": 0.6037735849056604,
+ "grad_norm": 0.5362441539764404,
+ "learning_rate": 7.0509177438411e-05,
+ "loss": 1.3647,
+ "step": 288
+ },
+ {
+ "epoch": 0.6058700209643606,
+ "grad_norm": 0.4826620817184448,
+ "learning_rate": 6.986705041134796e-05,
+ "loss": 1.5252,
+ "step": 289
+ },
+ {
+ "epoch": 0.6079664570230608,
+ "grad_norm": 0.4332706928253174,
+ "learning_rate": 6.922628704393904e-05,
+ "loss": 1.2502,
+ "step": 290
+ },
+ {
+ "epoch": 0.610062893081761,
+ "grad_norm": 0.47122621536254883,
+ "learning_rate": 6.858691633378202e-05,
+ "loss": 1.2901,
+ "step": 291
+ },
+ {
+ "epoch": 0.6121593291404612,
+ "grad_norm": 0.5842708945274353,
+ "learning_rate": 6.794896721545032e-05,
+ "loss": 1.5187,
+ "step": 292
+ },
+ {
+ "epoch": 0.6142557651991615,
+ "grad_norm": 0.4648444652557373,
+ "learning_rate": 6.73124685591835e-05,
+ "loss": 1.2955,
+ "step": 293
+ },
+ {
+ "epoch": 0.6163522012578616,
+ "grad_norm": 0.5161048173904419,
+ "learning_rate": 6.667744916958085e-05,
+ "loss": 1.4571,
+ "step": 294
+ },
+ {
+ "epoch": 0.6184486373165619,
+ "grad_norm": 0.4833495020866394,
+ "learning_rate": 6.604393778429772e-05,
+ "loss": 1.0478,
+ "step": 295
+ },
+ {
+ "epoch": 0.6205450733752621,
+ "grad_norm": 0.5789321660995483,
+ "learning_rate": 6.541196307274517e-05,
+ "loss": 1.3774,
+ "step": 296
+ },
+ {
+ "epoch": 0.6226415094339622,
+ "grad_norm": 0.6312127709388733,
+ "learning_rate": 6.478155363479236e-05,
+ "loss": 1.6907,
+ "step": 297
+ },
+ {
+ "epoch": 0.6247379454926625,
+ "grad_norm": 0.5820334553718567,
+ "learning_rate": 6.415273799947234e-05,
+ "loss": 1.1408,
+ "step": 298
+ },
+ {
+ "epoch": 0.6268343815513627,
+ "grad_norm": 0.6500141024589539,
+ "learning_rate": 6.352554462369112e-05,
+ "loss": 1.3631,
+ "step": 299
+ },
+ {
+ "epoch": 0.6289308176100629,
+ "grad_norm": 0.9458006024360657,
+ "learning_rate": 6.290000189093959e-05,
+ "loss": 1.3022,
+ "step": 300
+ },
+ {
+ "epoch": 0.6310272536687631,
+ "grad_norm": 0.32660362124443054,
+ "learning_rate": 6.227613811000925e-05,
+ "loss": 1.015,
+ "step": 301
+ },
+ {
+ "epoch": 0.6331236897274634,
+ "grad_norm": 0.2937367558479309,
+ "learning_rate": 6.165398151371106e-05,
+ "loss": 0.9367,
+ "step": 302
+ },
+ {
+ "epoch": 0.6352201257861635,
+ "grad_norm": 0.37501296401023865,
+ "learning_rate": 6.103356025759759e-05,
+ "loss": 1.2524,
+ "step": 303
+ },
+ {
+ "epoch": 0.6373165618448637,
+ "grad_norm": 0.29749903082847595,
+ "learning_rate": 6.04149024186891e-05,
+ "loss": 1.068,
+ "step": 304
+ },
+ {
+ "epoch": 0.639412997903564,
+ "grad_norm": 0.2890242338180542,
+ "learning_rate": 5.9798035994202836e-05,
+ "loss": 1.1022,
+ "step": 305
+ },
+ {
+ "epoch": 0.6415094339622641,
+ "grad_norm": 0.33415907621383667,
+ "learning_rate": 5.918298890028591e-05,
+ "loss": 1.4532,
+ "step": 306
+ },
+ {
+ "epoch": 0.6436058700209644,
+ "grad_norm": 0.2996525168418884,
+ "learning_rate": 5.8569788970752114e-05,
+ "loss": 1.0128,
+ "step": 307
+ },
+ {
+ "epoch": 0.6457023060796646,
+ "grad_norm": 0.343421071767807,
+ "learning_rate": 5.795846395582225e-05,
+ "loss": 1.0972,
+ "step": 308
+ },
+ {
+ "epoch": 0.6477987421383647,
+ "grad_norm": 0.32696643471717834,
+ "learning_rate": 5.734904152086828e-05,
+ "loss": 1.2231,
+ "step": 309
+ },
+ {
+ "epoch": 0.649895178197065,
+ "grad_norm": 0.34797459840774536,
+ "learning_rate": 5.6741549245161285e-05,
+ "loss": 1.2454,
+ "step": 310
+ },
+ {
+ "epoch": 0.6519916142557652,
+ "grad_norm": 0.3834897577762604,
+ "learning_rate": 5.6136014620623525e-05,
+ "loss": 1.288,
+ "step": 311
+ },
+ {
+ "epoch": 0.6540880503144654,
+ "grad_norm": 0.3343852162361145,
+ "learning_rate": 5.5532465050584206e-05,
+ "loss": 1.1281,
+ "step": 312
+ },
+ {
+ "epoch": 0.6561844863731656,
+ "grad_norm": 0.33837154507637024,
+ "learning_rate": 5.4930927848539256e-05,
+ "loss": 1.1146,
+ "step": 313
+ },
+ {
+ "epoch": 0.6582809224318659,
+ "grad_norm": 0.395158052444458,
+ "learning_rate": 5.433143023691547e-05,
+ "loss": 1.2687,
+ "step": 314
+ },
+ {
+ "epoch": 0.660377358490566,
+ "grad_norm": 0.3619198203086853,
+ "learning_rate": 5.373399934583839e-05,
+ "loss": 1.3387,
+ "step": 315
+ },
+ {
+ "epoch": 0.6624737945492662,
+ "grad_norm": 0.3589290380477905,
+ "learning_rate": 5.3138662211904654e-05,
+ "loss": 1.1821,
+ "step": 316
+ },
+ {
+ "epoch": 0.6645702306079665,
+ "grad_norm": 0.46267178654670715,
+ "learning_rate": 5.25454457769583e-05,
+ "loss": 1.3747,
+ "step": 317
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 0.35914307832717896,
+ "learning_rate": 5.1954376886871746e-05,
+ "loss": 1.0113,
+ "step": 318
+ },
+ {
+ "epoch": 0.6687631027253669,
+ "grad_norm": 0.3956226408481598,
+ "learning_rate": 5.1365482290330645e-05,
+ "loss": 1.2348,
+ "step": 319
+ },
+ {
+ "epoch": 0.6708595387840671,
+ "grad_norm": 0.3596416115760803,
+ "learning_rate": 5.07787886376236e-05,
+ "loss": 1.0949,
+ "step": 320
+ },
+ {
+ "epoch": 0.6729559748427673,
+ "grad_norm": 0.4158559739589691,
+ "learning_rate": 5.019432247943595e-05,
+ "loss": 1.236,
+ "step": 321
+ },
+ {
+ "epoch": 0.6750524109014675,
+ "grad_norm": 0.4549407958984375,
+ "learning_rate": 4.961211026564837e-05,
+ "loss": 1.3363,
+ "step": 322
+ },
+ {
+ "epoch": 0.6771488469601677,
+ "grad_norm": 0.3847133219242096,
+ "learning_rate": 4.90321783441397e-05,
+ "loss": 0.9304,
+ "step": 323
+ },
+ {
+ "epoch": 0.6792452830188679,
+ "grad_norm": 0.381661057472229,
+ "learning_rate": 4.845455295959468e-05,
+ "loss": 0.9649,
+ "step": 324
+ },
+ {
+ "epoch": 0.6813417190775681,
+ "grad_norm": 0.3821321129798889,
+ "learning_rate": 4.787926025231634e-05,
+ "loss": 1.128,
+ "step": 325
+ },
+ {
+ "epoch": 0.6834381551362684,
+ "grad_norm": 0.42627325654029846,
+ "learning_rate": 4.730632625704288e-05,
+ "loss": 1.2158,
+ "step": 326
+ },
+ {
+ "epoch": 0.6855345911949685,
+ "grad_norm": 0.5116820931434631,
+ "learning_rate": 4.673577690176956e-05,
+ "loss": 1.228,
+ "step": 327
+ },
+ {
+ "epoch": 0.6876310272536688,
+ "grad_norm": 0.37617841362953186,
+ "learning_rate": 4.616763800657534e-05,
+ "loss": 1.0418,
+ "step": 328
+ },
+ {
+ "epoch": 0.689727463312369,
+ "grad_norm": 0.40348488092422485,
+ "learning_rate": 4.560193528245425e-05,
+ "loss": 1.303,
+ "step": 329
+ },
+ {
+ "epoch": 0.6918238993710691,
+ "grad_norm": 0.39703086018562317,
+ "learning_rate": 4.5038694330152135e-05,
+ "loss": 1.0071,
+ "step": 330
+ },
+ {
+ "epoch": 0.6939203354297694,
+ "grad_norm": 0.4976246654987335,
+ "learning_rate": 4.447794063900772e-05,
+ "loss": 1.4131,
+ "step": 331
+ },
+ {
+ "epoch": 0.6960167714884696,
+ "grad_norm": 0.4676234722137451,
+ "learning_rate": 4.391969958579948e-05,
+ "loss": 1.4407,
+ "step": 332
+ },
+ {
+ "epoch": 0.6981132075471698,
+ "grad_norm": 0.47766080498695374,
+ "learning_rate": 4.3363996433596954e-05,
+ "loss": 1.196,
+ "step": 333
+ },
+ {
+ "epoch": 0.70020964360587,
+ "grad_norm": 0.45446816086769104,
+ "learning_rate": 4.281085633061764e-05,
+ "loss": 1.4197,
+ "step": 334
+ },
+ {
+ "epoch": 0.7023060796645703,
+ "grad_norm": 0.4232485294342041,
+ "learning_rate": 4.2260304309088696e-05,
+ "loss": 1.1722,
+ "step": 335
+ },
+ {
+ "epoch": 0.7044025157232704,
+ "grad_norm": 0.4267813563346863,
+ "learning_rate": 4.171236528411436e-05,
+ "loss": 1.1742,
+ "step": 336
+ },
+ {
+ "epoch": 0.7064989517819706,
+ "grad_norm": 0.5247170329093933,
+ "learning_rate": 4.116706405254834e-05,
+ "loss": 1.4083,
+ "step": 337
+ },
+ {
+ "epoch": 0.7085953878406709,
+ "grad_norm": 0.4244326949119568,
+ "learning_rate": 4.0624425291871506e-05,
+ "loss": 0.9588,
+ "step": 338
+ },
+ {
+ "epoch": 0.710691823899371,
+ "grad_norm": 0.47463715076446533,
+ "learning_rate": 4.0084473559075333e-05,
+ "loss": 1.2465,
+ "step": 339
+ },
+ {
+ "epoch": 0.7127882599580713,
+ "grad_norm": 0.5082678198814392,
+ "learning_rate": 3.9547233289550356e-05,
+ "loss": 1.2823,
+ "step": 340
+ },
+ {
+ "epoch": 0.7148846960167715,
+ "grad_norm": 0.4892716407775879,
+ "learning_rate": 3.901272879598058e-05,
+ "loss": 1.3535,
+ "step": 341
+ },
+ {
+ "epoch": 0.7169811320754716,
+ "grad_norm": 0.4783913791179657,
+ "learning_rate": 3.848098426724306e-05,
+ "loss": 1.2831,
+ "step": 342
+ },
+ {
+ "epoch": 0.7190775681341719,
+ "grad_norm": 0.551485002040863,
+ "learning_rate": 3.7952023767313264e-05,
+ "loss": 1.4206,
+ "step": 343
+ },
+ {
+ "epoch": 0.7211740041928721,
+ "grad_norm": 0.5458889007568359,
+ "learning_rate": 3.7425871234176134e-05,
+ "loss": 1.192,
+ "step": 344
+ },
+ {
+ "epoch": 0.7232704402515723,
+ "grad_norm": 0.5561458468437195,
+ "learning_rate": 3.690255047874267e-05,
+ "loss": 1.2386,
+ "step": 345
+ },
+ {
+ "epoch": 0.7253668763102725,
+ "grad_norm": 0.5652120113372803,
+ "learning_rate": 3.6382085183772394e-05,
+ "loss": 1.1185,
+ "step": 346
+ },
+ {
+ "epoch": 0.7274633123689728,
+ "grad_norm": 0.6339935064315796,
+ "learning_rate": 3.586449890280172e-05,
+ "loss": 1.406,
+ "step": 347
+ },
+ {
+ "epoch": 0.7295597484276729,
+ "grad_norm": 0.6352638006210327,
+ "learning_rate": 3.534981505907792e-05,
+ "loss": 1.4451,
+ "step": 348
+ },
+ {
+ "epoch": 0.7316561844863732,
+ "grad_norm": 0.7120410203933716,
+ "learning_rate": 3.483805694449913e-05,
+ "loss": 1.0784,
+ "step": 349
+ },
+ {
+ "epoch": 0.7337526205450734,
+ "grad_norm": 0.9480962753295898,
+ "learning_rate": 3.432924771856029e-05,
+ "loss": 1.1712,
+ "step": 350
+ },
+ {
+ "epoch": 0.7358490566037735,
+ "grad_norm": 0.35054638981819153,
+ "learning_rate": 3.3823410407305015e-05,
+ "loss": 1.3018,
+ "step": 351
+ },
+ {
+ "epoch": 0.7379454926624738,
+ "grad_norm": 0.30331677198410034,
+ "learning_rate": 3.33205679022837e-05,
+ "loss": 1.2488,
+ "step": 352
+ },
+ {
+ "epoch": 0.740041928721174,
+ "grad_norm": 0.30316171050071716,
+ "learning_rate": 3.2820742959517436e-05,
+ "loss": 0.9948,
+ "step": 353
+ },
+ {
+ "epoch": 0.7421383647798742,
+ "grad_norm": 0.3911697268486023,
+ "learning_rate": 3.232395819846824e-05,
+ "loss": 1.1168,
+ "step": 354
+ },
+ {
+ "epoch": 0.7442348008385744,
+ "grad_norm": 0.3382274806499481,
+ "learning_rate": 3.1830236101015446e-05,
+ "loss": 1.0427,
+ "step": 355
+ },
+ {
+ "epoch": 0.7463312368972747,
+ "grad_norm": 0.32363229990005493,
+ "learning_rate": 3.1339599010438134e-05,
+ "loss": 0.94,
+ "step": 356
+ },
+ {
+ "epoch": 0.7484276729559748,
+ "grad_norm": 0.3242010772228241,
+ "learning_rate": 3.0852069130404284e-05,
+ "loss": 1.2767,
+ "step": 357
+ },
+ {
+ "epoch": 0.750524109014675,
+ "grad_norm": 0.33006346225738525,
+ "learning_rate": 3.036766852396561e-05,
+ "loss": 0.9687,
+ "step": 358
+ },
+ {
+ "epoch": 0.7526205450733753,
+ "grad_norm": 0.35846400260925293,
+ "learning_rate": 2.9886419112559394e-05,
+ "loss": 1.2036,
+ "step": 359
+ },
+ {
+ "epoch": 0.7547169811320755,
+ "grad_norm": 0.35825300216674805,
+ "learning_rate": 2.9408342675016286e-05,
+ "loss": 1.4004,
+ "step": 360
+ },
+ {
+ "epoch": 0.7547169811320755,
+ "eval_loss": 1.2862719297409058,
+ "eval_runtime": 13.9861,
+ "eval_samples_per_second": 14.371,
+ "eval_steps_per_second": 7.221,
+ "step": 360
  }
  ],
  "logging_steps": 1,
@@ -1722,7 +2570,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.0089372973635994e+17,
+ "total_flos": 1.528664851092603e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null