mikhail-panzo commited on
Commit
25225e5
1 Parent(s): f746706

Training in progress, step 6500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d2b3399969fb74960ee01ffd897aef3b9bf5e7209f6f0ea877a12153f361408
3
  size 577789320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7a9ff8c6f099453e24c907ca48e18d629eeebd26b25e5d67dd78c92f08d9ec7
3
  size 577789320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7746ae2b28b16ee5e4dc0d43c7ed500663b9d82298a7c15d1500a39d65d9d274
3
  size 1155772233
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7dbfca172321951aa3408c63e50180be9688454b3122fc35e2e279441fcd5a2
3
  size 1155772233
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b78a35397517539ceb5abaec4c078472043c61c90e9313f43ee762be5908798
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc4bf9efeacd197a3755be03d6b9b32a091466296d5d750906cf5632d21aec65
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5cc3a4f205edd4add2b834d8b0c7057cd9c53044c257143116c01897f3cb6067
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9050220365c3d4317bfb9eee77e3abcc137e10ebf0d1b1a7e4370a6b88a28327
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.3189197778701782,
3
- "best_model_checkpoint": "mikhail_panzo/zlm_b128_le4_s8000/checkpoint-6000",
4
- "epoch": 10.052356020942408,
5
  "eval_steps": 500,
6
- "global_step": 6000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -943,6 +943,84 @@
943
  "eval_samples_per_second": 31.055,
944
  "eval_steps_per_second": 3.885,
945
  "step": 6000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
946
  }
947
  ],
948
  "logging_steps": 50,
@@ -962,7 +1040,7 @@
962
  "attributes": {}
963
  }
964
  },
965
- "total_flos": 1.0748974547355264e+17,
966
  "train_batch_size": 16,
967
  "trial_name": null,
968
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.31707289814949036,
3
+ "best_model_checkpoint": "mikhail_panzo/zlm_b128_le4_s8000/checkpoint-6500",
4
+ "epoch": 10.890052356020943,
5
  "eval_steps": 500,
6
+ "global_step": 6500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
943
  "eval_samples_per_second": 31.055,
944
  "eval_steps_per_second": 3.885,
945
  "step": 6000
946
+ },
947
+ {
948
+ "epoch": 10.136125654450261,
949
+ "grad_norm": 1.572178840637207,
950
+ "learning_rate": 3.253333333333333e-05,
951
+ "loss": 0.3492,
952
+ "step": 6050
953
+ },
954
+ {
955
+ "epoch": 10.219895287958115,
956
+ "grad_norm": 1.063959002494812,
957
+ "learning_rate": 3.1700000000000005e-05,
958
+ "loss": 0.3525,
959
+ "step": 6100
960
+ },
961
+ {
962
+ "epoch": 10.303664921465968,
963
+ "grad_norm": 1.1579703092575073,
964
+ "learning_rate": 3.086666666666667e-05,
965
+ "loss": 0.3486,
966
+ "step": 6150
967
+ },
968
+ {
969
+ "epoch": 10.387434554973822,
970
+ "grad_norm": 1.4260714054107666,
971
+ "learning_rate": 3.0033333333333336e-05,
972
+ "loss": 0.3483,
973
+ "step": 6200
974
+ },
975
+ {
976
+ "epoch": 10.471204188481675,
977
+ "grad_norm": 1.453321099281311,
978
+ "learning_rate": 2.9199999999999998e-05,
979
+ "loss": 0.3481,
980
+ "step": 6250
981
+ },
982
+ {
983
+ "epoch": 10.55497382198953,
984
+ "grad_norm": 1.8545498847961426,
985
+ "learning_rate": 2.836666666666667e-05,
986
+ "loss": 0.3482,
987
+ "step": 6300
988
+ },
989
+ {
990
+ "epoch": 10.638743455497382,
991
+ "grad_norm": 1.073957920074463,
992
+ "learning_rate": 2.7533333333333333e-05,
993
+ "loss": 0.348,
994
+ "step": 6350
995
+ },
996
+ {
997
+ "epoch": 10.722513089005236,
998
+ "grad_norm": 1.0049316883087158,
999
+ "learning_rate": 2.6700000000000002e-05,
1000
+ "loss": 0.3487,
1001
+ "step": 6400
1002
+ },
1003
+ {
1004
+ "epoch": 10.806282722513089,
1005
+ "grad_norm": 1.4970500469207764,
1006
+ "learning_rate": 2.5866666666666667e-05,
1007
+ "loss": 0.3468,
1008
+ "step": 6450
1009
+ },
1010
+ {
1011
+ "epoch": 10.890052356020943,
1012
+ "grad_norm": 1.6566526889801025,
1013
+ "learning_rate": 2.5033333333333336e-05,
1014
+ "loss": 0.3469,
1015
+ "step": 6500
1016
+ },
1017
+ {
1018
+ "epoch": 10.890052356020943,
1019
+ "eval_loss": 0.31707289814949036,
1020
+ "eval_runtime": 275.2143,
1021
+ "eval_samples_per_second": 30.845,
1022
+ "eval_steps_per_second": 3.859,
1023
+ "step": 6500
1024
  }
1025
  ],
1026
  "logging_steps": 50,
 
1040
  "attributes": {}
1041
  }
1042
  },
1043
+ "total_flos": 1.1645139306201984e+17,
1044
  "train_batch_size": 16,
1045
  "trial_name": null,
1046
  "trial_params": null