AlekseyKorshuk commited on
Commit
e2275aa
1 Parent(s): 60643e2

huggingartists

Browse files
README.md CHANGED
@@ -45,15 +45,15 @@ from datasets import load_dataset
45
  dataset = load_dataset("huggingartists/eminem")
46
  ```
47
 
48
- [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/1cw72fwg/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Eminem's lyrics.
53
 
54
- Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/1s9gn4n3) for full transparency and reproducibility.
55
 
56
- At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/1s9gn4n3/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
 
45
  dataset = load_dataset("huggingartists/eminem")
46
  ```
47
 
48
+ [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/29aftir4/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Eminem's lyrics.
53
 
54
+ Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/1yj0yyz9) for full transparency and reproducibility.
55
 
56
+ At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/1yj0yyz9/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
evaluation.txt CHANGED
@@ -1 +1 @@
1
- {"eval_loss": 0.7975038886070251, "eval_runtime": 7.9777, "eval_samples_per_second": 81.352, "eval_steps_per_second": 10.279, "epoch": 3.0}
 
1
+ {"eval_loss": 0.4826279878616333, "eval_runtime": 14.6559, "eval_samples_per_second": 44.897, "eval_steps_per_second": 5.663, "epoch": 4.0}
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:195352921c9768b8f7722a7c6dadd24e0bb41f9c5b687a015dad8577e2e19d1e
3
  size 497764120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:428633d1f6458ab43ed5eee44c1ef0505fe223f57c99426fe21435d1aa3b434b
3
  size 497764120
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d220c780eb71062e07399c346abb83a3d8d5d253a988aae163afad0591fdd04
3
  size 995604017
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f52b4dcb8476f623a272e10ac84ac049de0f1003081728e8884c6d2f759310f9
3
  size 995604017
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1ea0a731add3655e830275a1f48125cb7f3420e6c2aa5a5112f441a84aaadeb
3
  size 510396521
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:753a59950d6b66574f6214a57384f99ae242cd291bd19d5d9e54977a42f5f557
3
  size 510396521
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:85a94ba790ee5732238750868c0fa5cd84aca898f6e3c21a553f5290ecb0fd85
3
  size 14567
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32f46f1e2250c32c3ee6983a1c0513e245212fb9a74b0dd129330fa6abe719c7
3
  size 14567
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b357301156ff71969e9f7ff0267a328310fe92287f1483cfbf7826710ed8a0b
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33680f79177570c60fb9a75223c44dc245db8eeb97bbb366911a9c63f50a44d8
3
  size 623
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "best_metric": 0.7975038886070251,
3
- "best_model_checkpoint": "output/eminem/checkpoint-1455",
4
- "epoch": 3.0,
5
- "global_step": 1455,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -1776,11 +1776,595 @@
1776
  "eval_samples_per_second": 81.452,
1777
  "eval_steps_per_second": 10.291,
1778
  "step": 1455
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1779
  }
1780
  ],
1781
- "max_steps": 1455,
1782
- "num_train_epochs": 3,
1783
- "total_flos": 1519151874048000.0,
1784
  "trial_name": null,
1785
  "trial_params": null
1786
  }
 
1
  {
2
+ "best_metric": 0.4826279878616333,
3
+ "best_model_checkpoint": "output/eminem/checkpoint-1936",
4
+ "epoch": 4.0,
5
+ "global_step": 1936,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
1776
  "eval_samples_per_second": 81.452,
1777
  "eval_steps_per_second": 10.291,
1778
  "step": 1455
1779
+ },
1780
+ {
1781
+ "epoch": 3.02,
1782
+ "learning_rate": 3.533340602884326e-05,
1783
+ "loss": 0.6584,
1784
+ "step": 1460
1785
+ },
1786
+ {
1787
+ "epoch": 3.03,
1788
+ "learning_rate": 3.340418885898272e-05,
1789
+ "loss": 0.9188,
1790
+ "step": 1465
1791
+ },
1792
+ {
1793
+ "epoch": 3.04,
1794
+ "learning_rate": 3.1512039922545005e-05,
1795
+ "loss": 0.7885,
1796
+ "step": 1470
1797
+ },
1798
+ {
1799
+ "epoch": 3.05,
1800
+ "learning_rate": 2.9658952031065493e-05,
1801
+ "loss": 0.6785,
1802
+ "step": 1475
1803
+ },
1804
+ {
1805
+ "epoch": 3.06,
1806
+ "learning_rate": 2.784687685697823e-05,
1807
+ "loss": 0.9316,
1808
+ "step": 1480
1809
+ },
1810
+ {
1811
+ "epoch": 3.07,
1812
+ "learning_rate": 2.6077722878123704e-05,
1813
+ "loss": 0.7074,
1814
+ "step": 1485
1815
+ },
1816
+ {
1817
+ "epoch": 3.08,
1818
+ "learning_rate": 2.4353353367727083e-05,
1819
+ "loss": 0.5083,
1820
+ "step": 1490
1821
+ },
1822
+ {
1823
+ "epoch": 3.09,
1824
+ "learning_rate": 2.2675584432009507e-05,
1825
+ "loss": 0.7031,
1826
+ "step": 1495
1827
+ },
1828
+ {
1829
+ "epoch": 3.1,
1830
+ "learning_rate": 2.104618309745989e-05,
1831
+ "loss": 0.5955,
1832
+ "step": 1500
1833
+ },
1834
+ {
1835
+ "epoch": 3.11,
1836
+ "learning_rate": 1.94668654497983e-05,
1837
+ "loss": 0.6515,
1838
+ "step": 1505
1839
+ },
1840
+ {
1841
+ "epoch": 3.12,
1842
+ "learning_rate": 1.7939294826601462e-05,
1843
+ "loss": 0.6982,
1844
+ "step": 1510
1845
+ },
1846
+ {
1847
+ "epoch": 3.13,
1848
+ "learning_rate": 1.6465080065474947e-05,
1849
+ "loss": 0.7575,
1850
+ "step": 1515
1851
+ },
1852
+ {
1853
+ "epoch": 3.14,
1854
+ "learning_rate": 1.504577380961979e-05,
1855
+ "loss": 0.7294,
1856
+ "step": 1520
1857
+ },
1858
+ {
1859
+ "epoch": 3.15,
1860
+ "learning_rate": 1.3682870872588951e-05,
1861
+ "loss": 0.566,
1862
+ "step": 1525
1863
+ },
1864
+ {
1865
+ "epoch": 3.16,
1866
+ "learning_rate": 1.2377806663955526e-05,
1867
+ "loss": 0.4467,
1868
+ "step": 1530
1869
+ },
1870
+ {
1871
+ "epoch": 3.17,
1872
+ "learning_rate": 1.1131955677535227e-05,
1873
+ "loss": 0.8649,
1874
+ "step": 1535
1875
+ },
1876
+ {
1877
+ "epoch": 3.18,
1878
+ "learning_rate": 9.946630043766268e-06,
1879
+ "loss": 0.6318,
1880
+ "step": 1540
1881
+ },
1882
+ {
1883
+ "epoch": 3.19,
1884
+ "learning_rate": 8.8230781477809e-06,
1885
+ "loss": 0.5197,
1886
+ "step": 1545
1887
+ },
1888
+ {
1889
+ "epoch": 3.2,
1890
+ "learning_rate": 7.762483314597675e-06,
1891
+ "loss": 0.7195,
1892
+ "step": 1550
1893
+ },
1894
+ {
1895
+ "epoch": 3.21,
1896
+ "learning_rate": 6.765962562849582e-06,
1897
+ "loss": 0.6544,
1898
+ "step": 1555
1899
+ },
1900
+ {
1901
+ "epoch": 3.22,
1902
+ "learning_rate": 5.834565428339295e-06,
1903
+ "loss": 0.6373,
1904
+ "step": 1560
1905
+ },
1906
+ {
1907
+ "epoch": 3.23,
1908
+ "learning_rate": 4.969272858664371e-06,
1909
+ "loss": 0.7969,
1910
+ "step": 1565
1911
+ },
1912
+ {
1913
+ "epoch": 3.24,
1914
+ "learning_rate": 4.170996180083756e-06,
1915
+ "loss": 0.5398,
1916
+ "step": 1570
1917
+ },
1918
+ {
1919
+ "epoch": 3.25,
1920
+ "learning_rate": 3.440576137712397e-06,
1921
+ "loss": 0.6392,
1922
+ "step": 1575
1923
+ },
1924
+ {
1925
+ "epoch": 3.26,
1926
+ "learning_rate": 2.778782010045892e-06,
1927
+ "loss": 0.8183,
1928
+ "step": 1580
1929
+ },
1930
+ {
1931
+ "epoch": 3.27,
1932
+ "learning_rate": 2.186310798754732e-06,
1933
+ "loss": 0.4459,
1934
+ "step": 1585
1935
+ },
1936
+ {
1937
+ "epoch": 3.29,
1938
+ "learning_rate": 1.6637864946060192e-06,
1939
+ "loss": 0.6059,
1940
+ "step": 1590
1941
+ },
1942
+ {
1943
+ "epoch": 3.3,
1944
+ "learning_rate": 1.2117594202713427e-06,
1945
+ "loss": 0.9818,
1946
+ "step": 1595
1947
+ },
1948
+ {
1949
+ "epoch": 3.31,
1950
+ "learning_rate": 8.307056507336243e-07,
1951
+ "loss": 0.3983,
1952
+ "step": 1600
1953
+ },
1954
+ {
1955
+ "epoch": 3.32,
1956
+ "learning_rate": 5.21026511876941e-07,
1957
+ "loss": 0.9094,
1958
+ "step": 1605
1959
+ },
1960
+ {
1961
+ "epoch": 3.33,
1962
+ "learning_rate": 2.8304815781598153e-07,
1963
+ "loss": 0.5006,
1964
+ "step": 1610
1965
+ },
1966
+ {
1967
+ "epoch": 3.34,
1968
+ "learning_rate": 1.1702122738749632e-07,
1969
+ "loss": 0.5685,
1970
+ "step": 1615
1971
+ },
1972
+ {
1973
+ "epoch": 3.35,
1974
+ "learning_rate": 2.312058017904961e-08,
1975
+ "loss": 0.7558,
1976
+ "step": 1620
1977
+ },
1978
+ {
1979
+ "epoch": 3.36,
1980
+ "learning_rate": 1.445112365939849e-09,
1981
+ "loss": 0.5613,
1982
+ "step": 1625
1983
+ },
1984
+ {
1985
+ "epoch": 3.37,
1986
+ "learning_rate": 5.2017652553981476e-08,
1987
+ "loss": 0.7474,
1988
+ "step": 1630
1989
+ },
1990
+ {
1991
+ "epoch": 3.38,
1992
+ "learning_rate": 1.7478493773641084e-07,
1993
+ "loss": 0.7388,
1994
+ "step": 1635
1995
+ },
1996
+ {
1997
+ "epoch": 3.39,
1998
+ "learning_rate": 3.69617669391192e-07,
1999
+ "loss": 0.5788,
2000
+ "step": 1640
2001
+ },
2002
+ {
2003
+ "epoch": 3.4,
2004
+ "learning_rate": 6.363106496559428e-07,
2005
+ "loss": 0.6158,
2006
+ "step": 1645
2007
+ },
2008
+ {
2009
+ "epoch": 3.41,
2010
+ "learning_rate": 9.745829974457439e-07,
2011
+ "loss": 0.6697,
2012
+ "step": 1650
2013
+ },
2014
+ {
2015
+ "epoch": 3.42,
2016
+ "learning_rate": 1.3840784442740293e-06,
2017
+ "loss": 0.5538,
2018
+ "step": 1655
2019
+ },
2020
+ {
2021
+ "epoch": 3.43,
2022
+ "learning_rate": 1.8643657094772689e-06,
2023
+ "loss": 0.7529,
2024
+ "step": 1660
2025
+ },
2026
+ {
2027
+ "epoch": 3.44,
2028
+ "learning_rate": 2.4149389544374383e-06,
2029
+ "loss": 0.5164,
2030
+ "step": 1665
2031
+ },
2032
+ {
2033
+ "epoch": 3.45,
2034
+ "learning_rate": 3.035218315332177e-06,
2035
+ "loss": 0.5686,
2036
+ "step": 1670
2037
+ },
2038
+ {
2039
+ "epoch": 3.46,
2040
+ "learning_rate": 3.7245505138493062e-06,
2041
+ "loss": 0.7307,
2042
+ "step": 1675
2043
+ },
2044
+ {
2045
+ "epoch": 3.47,
2046
+ "learning_rate": 4.482209545215727e-06,
2047
+ "loss": 0.4993,
2048
+ "step": 1680
2049
+ },
2050
+ {
2051
+ "epoch": 3.48,
2052
+ "learning_rate": 5.307397442832051e-06,
2053
+ "loss": 0.5905,
2054
+ "step": 1685
2055
+ },
2056
+ {
2057
+ "epoch": 3.49,
2058
+ "learning_rate": 6.199245118679951e-06,
2059
+ "loss": 0.7372,
2060
+ "step": 1690
2061
+ },
2062
+ {
2063
+ "epoch": 3.5,
2064
+ "learning_rate": 7.15681327866027e-06,
2065
+ "loss": 0.7588,
2066
+ "step": 1695
2067
+ },
2068
+ {
2069
+ "epoch": 3.51,
2070
+ "learning_rate": 8.179093411845718e-06,
2071
+ "loss": 0.7609,
2072
+ "step": 1700
2073
+ },
2074
+ {
2075
+ "epoch": 3.52,
2076
+ "learning_rate": 9.265008852652901e-06,
2077
+ "loss": 0.4208,
2078
+ "step": 1705
2079
+ },
2080
+ {
2081
+ "epoch": 3.53,
2082
+ "learning_rate": 1.0413415914780494e-05,
2083
+ "loss": 0.5304,
2084
+ "step": 1710
2085
+ },
2086
+ {
2087
+ "epoch": 3.54,
2088
+ "learning_rate": 1.1623105095742268e-05,
2089
+ "loss": 0.47,
2090
+ "step": 1715
2091
+ },
2092
+ {
2093
+ "epoch": 3.55,
2094
+ "learning_rate": 1.2892802350718737e-05,
2095
+ "loss": 0.4479,
2096
+ "step": 1720
2097
+ },
2098
+ {
2099
+ "epoch": 3.56,
2100
+ "learning_rate": 1.4221170434376129e-05,
2101
+ "loss": 0.3907,
2102
+ "step": 1725
2103
+ },
2104
+ {
2105
+ "epoch": 3.57,
2106
+ "learning_rate": 1.5606810309252903e-05,
2107
+ "loss": 0.8871,
2108
+ "step": 1730
2109
+ },
2110
+ {
2111
+ "epoch": 3.58,
2112
+ "learning_rate": 1.7048262619230822e-05,
2113
+ "loss": 0.5595,
2114
+ "step": 1735
2115
+ },
2116
+ {
2117
+ "epoch": 3.6,
2118
+ "learning_rate": 1.8544009226529548e-05,
2119
+ "loss": 0.6169,
2120
+ "step": 1740
2121
+ },
2122
+ {
2123
+ "epoch": 3.61,
2124
+ "learning_rate": 2.0092474810601755e-05,
2125
+ "loss": 0.557,
2126
+ "step": 1745
2127
+ },
2128
+ {
2129
+ "epoch": 3.62,
2130
+ "learning_rate": 2.169202852727143e-05,
2131
+ "loss": 0.8663,
2132
+ "step": 1750
2133
+ },
2134
+ {
2135
+ "epoch": 3.63,
2136
+ "learning_rate": 2.334098572632982e-05,
2137
+ "loss": 0.754,
2138
+ "step": 1755
2139
+ },
2140
+ {
2141
+ "epoch": 3.64,
2142
+ "learning_rate": 2.5037609725813137e-05,
2143
+ "loss": 0.7106,
2144
+ "step": 1760
2145
+ },
2146
+ {
2147
+ "epoch": 3.65,
2148
+ "learning_rate": 2.6780113641080935e-05,
2149
+ "loss": 0.8902,
2150
+ "step": 1765
2151
+ },
2152
+ {
2153
+ "epoch": 3.66,
2154
+ "learning_rate": 2.8566662266755925e-05,
2155
+ "loss": 0.5382,
2156
+ "step": 1770
2157
+ },
2158
+ {
2159
+ "epoch": 3.67,
2160
+ "learning_rate": 3.039537400956214e-05,
2161
+ "loss": 0.5907,
2162
+ "step": 1775
2163
+ },
2164
+ {
2165
+ "epoch": 3.68,
2166
+ "learning_rate": 3.226432287002555e-05,
2167
+ "loss": 1.0195,
2168
+ "step": 1780
2169
+ },
2170
+ {
2171
+ "epoch": 3.69,
2172
+ "learning_rate": 3.417154047093731e-05,
2173
+ "loss": 0.5186,
2174
+ "step": 1785
2175
+ },
2176
+ {
2177
+ "epoch": 3.7,
2178
+ "learning_rate": 3.611501813044067e-05,
2179
+ "loss": 0.4889,
2180
+ "step": 1790
2181
+ },
2182
+ {
2183
+ "epoch": 3.71,
2184
+ "learning_rate": 3.8092708977579776e-05,
2185
+ "loss": 0.6288,
2186
+ "step": 1795
2187
+ },
2188
+ {
2189
+ "epoch": 3.72,
2190
+ "learning_rate": 4.010253010806977e-05,
2191
+ "loss": 0.7543,
2192
+ "step": 1800
2193
+ },
2194
+ {
2195
+ "epoch": 3.73,
2196
+ "learning_rate": 4.214236477799971e-05,
2197
+ "loss": 0.5513,
2198
+ "step": 1805
2199
+ },
2200
+ {
2201
+ "epoch": 3.74,
2202
+ "learning_rate": 4.421006463320525e-05,
2203
+ "loss": 0.6435,
2204
+ "step": 1810
2205
+ },
2206
+ {
2207
+ "epoch": 3.75,
2208
+ "learning_rate": 4.630345197188684e-05,
2209
+ "loss": 0.4894,
2210
+ "step": 1815
2211
+ },
2212
+ {
2213
+ "epoch": 3.76,
2214
+ "learning_rate": 4.8420322038201655e-05,
2215
+ "loss": 0.4861,
2216
+ "step": 1820
2217
+ },
2218
+ {
2219
+ "epoch": 3.77,
2220
+ "learning_rate": 5.0558445344283775e-05,
2221
+ "loss": 0.634,
2222
+ "step": 1825
2223
+ },
2224
+ {
2225
+ "epoch": 3.78,
2226
+ "learning_rate": 5.2715570018359465e-05,
2227
+ "loss": 0.5037,
2228
+ "step": 1830
2229
+ },
2230
+ {
2231
+ "epoch": 3.79,
2232
+ "learning_rate": 5.48894241764059e-05,
2233
+ "loss": 0.6078,
2234
+ "step": 1835
2235
+ },
2236
+ {
2237
+ "epoch": 3.8,
2238
+ "learning_rate": 5.70777183149052e-05,
2239
+ "loss": 0.7133,
2240
+ "step": 1840
2241
+ },
2242
+ {
2243
+ "epoch": 3.81,
2244
+ "learning_rate": 5.927814772215574e-05,
2245
+ "loss": 0.6985,
2246
+ "step": 1845
2247
+ },
2248
+ {
2249
+ "epoch": 3.82,
2250
+ "learning_rate": 6.148839490558139e-05,
2251
+ "loss": 0.6508,
2252
+ "step": 1850
2253
+ },
2254
+ {
2255
+ "epoch": 3.83,
2256
+ "learning_rate": 6.370613203253432e-05,
2257
+ "loss": 0.4598,
2258
+ "step": 1855
2259
+ },
2260
+ {
2261
+ "epoch": 3.84,
2262
+ "learning_rate": 6.592902338193773e-05,
2263
+ "loss": 0.428,
2264
+ "step": 1860
2265
+ },
2266
+ {
2267
+ "epoch": 3.85,
2268
+ "learning_rate": 6.815472780430632e-05,
2269
+ "loss": 0.5706,
2270
+ "step": 1865
2271
+ },
2272
+ {
2273
+ "epoch": 3.86,
2274
+ "learning_rate": 7.038090118741434e-05,
2275
+ "loss": 0.9235,
2276
+ "step": 1870
2277
+ },
2278
+ {
2279
+ "epoch": 3.87,
2280
+ "learning_rate": 7.260519892513427e-05,
2281
+ "loss": 0.8443,
2282
+ "step": 1875
2283
+ },
2284
+ {
2285
+ "epoch": 3.88,
2286
+ "learning_rate": 7.482527838676278e-05,
2287
+ "loss": 0.5691,
2288
+ "step": 1880
2289
+ },
2290
+ {
2291
+ "epoch": 3.89,
2292
+ "learning_rate": 7.703880138428551e-05,
2293
+ "loss": 0.4075,
2294
+ "step": 1885
2295
+ },
2296
+ {
2297
+ "epoch": 3.9,
2298
+ "learning_rate": 7.924343663496286e-05,
2299
+ "loss": 0.4282,
2300
+ "step": 1890
2301
+ },
2302
+ {
2303
+ "epoch": 3.92,
2304
+ "learning_rate": 8.14368622166238e-05,
2305
+ "loss": 0.6324,
2306
+ "step": 1895
2307
+ },
2308
+ {
2309
+ "epoch": 3.93,
2310
+ "learning_rate": 8.361676801313385e-05,
2311
+ "loss": 0.7816,
2312
+ "step": 1900
2313
+ },
2314
+ {
2315
+ "epoch": 3.94,
2316
+ "learning_rate": 8.578085814737905e-05,
2317
+ "loss": 0.6076,
2318
+ "step": 1905
2319
+ },
2320
+ {
2321
+ "epoch": 3.95,
2322
+ "learning_rate": 8.792685339932241e-05,
2323
+ "loss": 0.5257,
2324
+ "step": 1910
2325
+ },
2326
+ {
2327
+ "epoch": 3.96,
2328
+ "learning_rate": 9.005249360644818e-05,
2329
+ "loss": 0.6216,
2330
+ "step": 1915
2331
+ },
2332
+ {
2333
+ "epoch": 3.97,
2334
+ "learning_rate": 9.215554004418427e-05,
2335
+ "loss": 0.7805,
2336
+ "step": 1920
2337
+ },
2338
+ {
2339
+ "epoch": 3.98,
2340
+ "learning_rate": 9.423377778371423e-05,
2341
+ "loss": 0.7339,
2342
+ "step": 1925
2343
+ },
2344
+ {
2345
+ "epoch": 3.99,
2346
+ "learning_rate": 9.628501802474621e-05,
2347
+ "loss": 0.6319,
2348
+ "step": 1930
2349
+ },
2350
+ {
2351
+ "epoch": 4.0,
2352
+ "learning_rate": 9.830710040076315e-05,
2353
+ "loss": 0.5267,
2354
+ "step": 1935
2355
+ },
2356
+ {
2357
+ "epoch": 4.0,
2358
+ "eval_loss": 0.4826279878616333,
2359
+ "eval_runtime": 14.7153,
2360
+ "eval_samples_per_second": 44.716,
2361
+ "eval_steps_per_second": 5.64,
2362
+ "step": 1936
2363
  }
2364
  ],
2365
+ "max_steps": 1936,
2366
+ "num_train_epochs": 4,
2367
+ "total_flos": 2021355159552000.0,
2368
  "trial_name": null,
2369
  "trial_params": null
2370
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a191d7ddb4193d5d88e644c72ac19b8cc1f37c551be0aecf17ab07f676a5dc3
3
  size 3375
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28f78ff1353e48b556a427f64b800f3a4cfdb95bdf8553eb6558c9abb866bf9e
3
  size 3375