mikhail-panzo committed
Commit 9de99a3
Parent: 361bb40

Training in progress, step 12000, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2f4bc91b2bf1d4a600334b1faa106af9988bbfd956f7192f84cc46e78a068110
+oid sha256:56a801f7874b64e033e7d65093ff6a670b591b9e352bb55a9efedb82bb7bd081
 size 577789320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:064a18e6c4d9e599dfb3d70c36124b53d38c2edc52983709d3ae7adeb24da2e8
+oid sha256:d15e5170813dcff285d169be8481933a4cb0c9a73034315f866ed759fa591047
 size 1155772233
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2719dc48cfd7b0c8e47595f5673050b1706a0337b7156e3973f160d025717221
+oid sha256:23d7f02c1a1a9aea0c08b9ee9ea167031247ef6b9121df34e7f784a8ef6bb970
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ca290528005f495cfd7077923dce176b5d944942c2e6ddb59ada2a96248ccd53
+oid sha256:b2891075898e42caf114b73a9c8652d14f8fed0b39122b829f8ae7578e63f2a0
 size 1064
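
The four files above are Git LFS objects, so the repository itself only stores the small pointer shown in each diff (the version, oid sha256, and size lines); this commit swaps each oid to the artifact written at step 12000 while the byte sizes stay unchanged. Below is a minimal sketch, in Python, for checking that a locally downloaded file matches its pointer; the last-checkpoint/ path and the verify_lfs_pointer helper name are illustrative assumptions, not part of the repository.

import hashlib
import os

def verify_lfs_pointer(local_path: str, expected_oid: str, expected_size: int) -> bool:
    """Compare a downloaded LFS object against the oid/size recorded in its pointer file."""
    if os.path.getsize(local_path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(local_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

# Values copied from the new pointer for last-checkpoint/model.safetensors above;
# the local path assumes the checkpoint has been cloned or downloaded next to this script.
print(verify_lfs_pointer(
    "last-checkpoint/model.safetensors",
    "56a801f7874b64e033e7d65093ff6a670b591b9e352bb55a9efedb82bb7bd081",
    577789320,
))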
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": 0.3699803650379181,
   "best_model_checkpoint": "mikhail-panzo/zlm_b32_le5_s12000/checkpoint-11500",
-  "epoch": 4.816753926701571,
+  "epoch": 5.026178010471204,
   "eval_steps": 500,
-  "global_step": 11500,
+  "global_step": 12000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1801,6 +1801,84 @@
       "eval_samples_per_second": 28.557,
       "eval_steps_per_second": 3.573,
       "step": 11500
+    },
+    {
+      "epoch": 4.837696335078534,
+      "grad_norm": 1.7664463520050049,
+      "learning_rate": 4.5500000000000004e-07,
+      "loss": 0.4102,
+      "step": 11550
+    },
+    {
+      "epoch": 4.858638743455497,
+      "grad_norm": 2.0069000720977783,
+      "learning_rate": 4.0500000000000004e-07,
+      "loss": 0.4026,
+      "step": 11600
+    },
+    {
+      "epoch": 4.879581151832461,
+      "grad_norm": 1.475438117980957,
+      "learning_rate": 3.55e-07,
+      "loss": 0.4132,
+      "step": 11650
+    },
+    {
+      "epoch": 4.900523560209424,
+      "grad_norm": 1.6426637172698975,
+      "learning_rate": 3.0500000000000004e-07,
+      "loss": 0.4082,
+      "step": 11700
+    },
+    {
+      "epoch": 4.9214659685863875,
+      "grad_norm": 1.6717259883880615,
+      "learning_rate": 2.55e-07,
+      "loss": 0.4115,
+      "step": 11750
+    },
+    {
+      "epoch": 4.942408376963351,
+      "grad_norm": 1.9856605529785156,
+      "learning_rate": 2.0500000000000002e-07,
+      "loss": 0.4064,
+      "step": 11800
+    },
+    {
+      "epoch": 4.963350785340314,
+      "grad_norm": 2.535362958908081,
+      "learning_rate": 1.5500000000000002e-07,
+      "loss": 0.4126,
+      "step": 11850
+    },
+    {
+      "epoch": 4.984293193717278,
+      "grad_norm": 1.6207140684127808,
+      "learning_rate": 1.0500000000000001e-07,
+      "loss": 0.4139,
+      "step": 11900
+    },
+    {
+      "epoch": 5.005235602094241,
+      "grad_norm": 1.4156582355499268,
+      "learning_rate": 5.5e-08,
+      "loss": 0.409,
+      "step": 11950
+    },
+    {
+      "epoch": 5.026178010471204,
+      "grad_norm": 3.40710186958313,
+      "learning_rate": 5e-09,
+      "loss": 0.4088,
+      "step": 12000
+    },
+    {
+      "epoch": 5.026178010471204,
+      "eval_loss": 0.3706651031970978,
+      "eval_runtime": 299.1328,
+      "eval_samples_per_second": 28.379,
+      "eval_steps_per_second": 3.55,
+      "step": 12000
     }
   ],
   "logging_steps": 50,
@@ -1815,12 +1893,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos": 5.149805174423712e+16,
+  "total_flos": 5.37491824204009e+16,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null