Bingsu commited on
Commit
f1f4522
1 Parent(s): 809d256

Training in progress, step 70000

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ceedefb856fd84795e75aee417a2889e7dfef00f9cca82e610fbafac5203514
3
- size 100170757
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3eb55ec1aacbfeebe119e515b71d01fac6a80c1dc916333ed52358ff9464626e
3
+ size 100172997
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c51f436fc6ff7c66c8286fca81fd6d00dc485176f29ebe17de85db28a4fa91b5
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e319b3e22d458ba27ff2a2eb8537fae27cd2f8bcba6cd5bc802fb4266dab1c01
3
  size 146774203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff9803e49c54da5b93ea63a8f9cfb55e640978474df5d52e215ba5da04a71f90
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b63f7cf635c5cd7e0a6a99be90b9c9040bc4b142713e70d6ed808fdd72cc930
3
  size 14503
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e429486456e317e2d30183574218e6d221698c823284eb9740704ef563e5d5d
3
- size 246897640
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cce12b461956f7f82f9c60078f067ba7f5af96b281245752bc9e8d8eb78bb3a
3
+ size 246899880
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.2578427159432746,
5
- "global_step": 60000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -1806,11 +1806,311 @@
1806
  "learning_rate": 0.0003716756671558975,
1807
  "loss": 3.3157,
1808
  "step": 60000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1809
  }
1810
  ],
1811
  "max_steps": 500000,
1812
  "num_train_epochs": 3,
1813
- "total_flos": 9.562979893248e+16,
1814
  "trial_name": null,
1815
  "trial_params": null
1816
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.30081650193382037,
5
+ "global_step": 70000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
1806
  "learning_rate": 0.0003716756671558975,
1807
  "loss": 3.3157,
1808
  "step": 60000
1809
+ },
1810
+ {
1811
+ "epoch": 0.26,
1812
+ "learning_rate": 0.0003735891933646703,
1813
+ "loss": 3.2991,
1814
+ "step": 60200
1815
+ },
1816
+ {
1817
+ "epoch": 0.26,
1818
+ "learning_rate": 0.00037550528852259106,
1819
+ "loss": 3.2628,
1820
+ "step": 60400
1821
+ },
1822
+ {
1823
+ "epoch": 0.26,
1824
+ "learning_rate": 0.00037742391900952516,
1825
+ "loss": 3.2928,
1826
+ "step": 60600
1827
+ },
1828
+ {
1829
+ "epoch": 0.26,
1830
+ "learning_rate": 0.0003793450511608526,
1831
+ "loss": 3.2828,
1832
+ "step": 60800
1833
+ },
1834
+ {
1835
+ "epoch": 0.26,
1836
+ "learning_rate": 0.00038126865126805905,
1837
+ "loss": 3.2863,
1838
+ "step": 61000
1839
+ },
1840
+ {
1841
+ "epoch": 0.26,
1842
+ "learning_rate": 0.0003831946855793267,
1843
+ "loss": 3.2951,
1844
+ "step": 61200
1845
+ },
1846
+ {
1847
+ "epoch": 0.26,
1848
+ "learning_rate": 0.00038512312030012676,
1849
+ "loss": 3.2583,
1850
+ "step": 61400
1851
+ },
1852
+ {
1853
+ "epoch": 0.26,
1854
+ "learning_rate": 0.0003870539215938128,
1855
+ "loss": 3.2872,
1856
+ "step": 61600
1857
+ },
1858
+ {
1859
+ "epoch": 0.27,
1860
+ "learning_rate": 0.00038898705558221367,
1861
+ "loss": 3.2748,
1862
+ "step": 61800
1863
+ },
1864
+ {
1865
+ "epoch": 0.27,
1866
+ "learning_rate": 0.00039092248834622883,
1867
+ "loss": 3.3026,
1868
+ "step": 62000
1869
+ },
1870
+ {
1871
+ "epoch": 0.27,
1872
+ "learning_rate": 0.00039286018592642224,
1873
+ "loss": 3.2734,
1874
+ "step": 62200
1875
+ },
1876
+ {
1877
+ "epoch": 0.27,
1878
+ "learning_rate": 0.00039480011432362007,
1879
+ "loss": 3.2849,
1880
+ "step": 62400
1881
+ },
1882
+ {
1883
+ "epoch": 0.27,
1884
+ "learning_rate": 0.00039674223949950514,
1885
+ "loss": 3.2889,
1886
+ "step": 62600
1887
+ },
1888
+ {
1889
+ "epoch": 0.27,
1890
+ "learning_rate": 0.0003986865273772159,
1891
+ "loss": 3.2938,
1892
+ "step": 62800
1893
+ },
1894
+ {
1895
+ "epoch": 0.27,
1896
+ "learning_rate": 0.00040063294384194367,
1897
+ "loss": 3.2755,
1898
+ "step": 63000
1899
+ },
1900
+ {
1901
+ "epoch": 0.27,
1902
+ "learning_rate": 0.0004025814547415307,
1903
+ "loss": 3.2774,
1904
+ "step": 63200
1905
+ },
1906
+ {
1907
+ "epoch": 0.27,
1908
+ "learning_rate": 0.00040453202588707036,
1909
+ "loss": 3.2819,
1910
+ "step": 63400
1911
+ },
1912
+ {
1913
+ "epoch": 0.27,
1914
+ "learning_rate": 0.0004064846230535067,
1915
+ "loss": 3.2867,
1916
+ "step": 63600
1917
+ },
1918
+ {
1919
+ "epoch": 0.27,
1920
+ "learning_rate": 0.00040843921198023417,
1921
+ "loss": 3.2856,
1922
+ "step": 63800
1923
+ },
1924
+ {
1925
+ "epoch": 0.28,
1926
+ "learning_rate": 0.0004103957583717001,
1927
+ "loss": 3.2932,
1928
+ "step": 64000
1929
+ },
1930
+ {
1931
+ "epoch": 0.28,
1932
+ "learning_rate": 0.0004123542278980058,
1933
+ "loss": 3.272,
1934
+ "step": 64200
1935
+ },
1936
+ {
1937
+ "epoch": 0.28,
1938
+ "learning_rate": 0.00041431458619550874,
1939
+ "loss": 3.2697,
1940
+ "step": 64400
1941
+ },
1942
+ {
1943
+ "epoch": 0.28,
1944
+ "learning_rate": 0.00041627679886742527,
1945
+ "loss": 3.276,
1946
+ "step": 64600
1947
+ },
1948
+ {
1949
+ "epoch": 0.28,
1950
+ "learning_rate": 0.0004182408314844355,
1951
+ "loss": 3.2732,
1952
+ "step": 64800
1953
+ },
1954
+ {
1955
+ "epoch": 0.28,
1956
+ "learning_rate": 0.00042020664958528574,
1957
+ "loss": 3.2535,
1958
+ "step": 65000
1959
+ },
1960
+ {
1961
+ "epoch": 0.28,
1962
+ "learning_rate": 0.0004221742186773941,
1963
+ "loss": 3.2791,
1964
+ "step": 65200
1965
+ },
1966
+ {
1967
+ "epoch": 0.28,
1968
+ "learning_rate": 0.0004241435042374555,
1969
+ "loss": 3.2718,
1970
+ "step": 65400
1971
+ },
1972
+ {
1973
+ "epoch": 0.28,
1974
+ "learning_rate": 0.0004261144717120477,
1975
+ "loss": 3.2672,
1976
+ "step": 65600
1977
+ },
1978
+ {
1979
+ "epoch": 0.28,
1980
+ "learning_rate": 0.00042808708651823654,
1981
+ "loss": 3.2555,
1982
+ "step": 65800
1983
+ },
1984
+ {
1985
+ "epoch": 0.28,
1986
+ "learning_rate": 0.00043006131404418424,
1987
+ "loss": 3.2717,
1988
+ "step": 66000
1989
+ },
1990
+ {
1991
+ "epoch": 0.28,
1992
+ "learning_rate": 0.00043203711964975595,
1993
+ "loss": 3.2497,
1994
+ "step": 66200
1995
+ },
1996
+ {
1997
+ "epoch": 0.29,
1998
+ "learning_rate": 0.00043401446866712684,
1999
+ "loss": 3.2525,
2000
+ "step": 66400
2001
+ },
2002
+ {
2003
+ "epoch": 0.29,
2004
+ "learning_rate": 0.000435993326401392,
2005
+ "loss": 3.2523,
2006
+ "step": 66600
2007
+ },
2008
+ {
2009
+ "epoch": 0.29,
2010
+ "learning_rate": 0.0004379736581311737,
2011
+ "loss": 3.2431,
2012
+ "step": 66800
2013
+ },
2014
+ {
2015
+ "epoch": 0.29,
2016
+ "learning_rate": 0.00043995542910923167,
2017
+ "loss": 3.2571,
2018
+ "step": 67000
2019
+ },
2020
+ {
2021
+ "epoch": 0.29,
2022
+ "learning_rate": 0.0004419386045630716,
2023
+ "loss": 3.2445,
2024
+ "step": 67200
2025
+ },
2026
+ {
2027
+ "epoch": 0.29,
2028
+ "learning_rate": 0.0004439231496955571,
2029
+ "loss": 3.2508,
2030
+ "step": 67400
2031
+ },
2032
+ {
2033
+ "epoch": 0.29,
2034
+ "learning_rate": 0.00044590902968551834,
2035
+ "loss": 3.2435,
2036
+ "step": 67600
2037
+ },
2038
+ {
2039
+ "epoch": 0.29,
2040
+ "learning_rate": 0.00044789620968836404,
2041
+ "loss": 3.2436,
2042
+ "step": 67800
2043
+ },
2044
+ {
2045
+ "epoch": 0.29,
2046
+ "learning_rate": 0.0004498846548366927,
2047
+ "loss": 3.2533,
2048
+ "step": 68000
2049
+ },
2050
+ {
2051
+ "epoch": 0.29,
2052
+ "learning_rate": 0.0004518743302409047,
2053
+ "loss": 3.2523,
2054
+ "step": 68200
2055
+ },
2056
+ {
2057
+ "epoch": 0.29,
2058
+ "learning_rate": 0.00045386520098981335,
2059
+ "loss": 3.2284,
2060
+ "step": 68400
2061
+ },
2062
+ {
2063
+ "epoch": 0.29,
2064
+ "learning_rate": 0.0004558572321512592,
2065
+ "loss": 3.2285,
2066
+ "step": 68600
2067
+ },
2068
+ {
2069
+ "epoch": 0.3,
2070
+ "learning_rate": 0.00045785038877272114,
2071
+ "loss": 3.2395,
2072
+ "step": 68800
2073
+ },
2074
+ {
2075
+ "epoch": 0.3,
2076
+ "learning_rate": 0.00045984463588193104,
2077
+ "loss": 3.2592,
2078
+ "step": 69000
2079
+ },
2080
+ {
2081
+ "epoch": 0.3,
2082
+ "learning_rate": 0.00046183993848748675,
2083
+ "loss": 3.2478,
2084
+ "step": 69200
2085
+ },
2086
+ {
2087
+ "epoch": 0.3,
2088
+ "learning_rate": 0.0004638362615794662,
2089
+ "loss": 3.2303,
2090
+ "step": 69400
2091
+ },
2092
+ {
2093
+ "epoch": 0.3,
2094
+ "learning_rate": 0.00046583357013004194,
2095
+ "loss": 3.242,
2096
+ "step": 69600
2097
+ },
2098
+ {
2099
+ "epoch": 0.3,
2100
+ "learning_rate": 0.00046783182909409496,
2101
+ "loss": 3.2582,
2102
+ "step": 69800
2103
+ },
2104
+ {
2105
+ "epoch": 0.3,
2106
+ "learning_rate": 0.00046983100340983056,
2107
+ "loss": 3.2461,
2108
+ "step": 70000
2109
  }
2110
  ],
2111
  "max_steps": 500000,
2112
  "num_train_epochs": 3,
2113
+ "total_flos": 1.1156809875456e+17,
2114
  "trial_name": null,
2115
  "trial_params": null
2116
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c51f436fc6ff7c66c8286fca81fd6d00dc485176f29ebe17de85db28a4fa91b5
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e319b3e22d458ba27ff2a2eb8537fae27cd2f8bcba6cd5bc802fb4266dab1c01
3
  size 146774203