mikhail-panzo committed
Commit c928325
1 Parent(s): 948c462

Training in progress, step 7000, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d7a9ff8c6f099453e24c907ca48e18d629eeebd26b25e5d67dd78c92f08d9ec7
+oid sha256:cce8730ec7977cf6faded9c8a64710dd40c64e82312111fdae82ba0e37b6fe02
 size 577789320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d7dbfca172321951aa3408c63e50180be9688454b3122fc35e2e279441fcd5a2
+oid sha256:7d0bacbf128b4eb958fabc358356f531f65f6bd424a2a9be5add80fa30ab7cff
 size 1155772233
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fc4bf9efeacd197a3755be03d6b9b32a091466296d5d750906cf5632d21aec65
+oid sha256:a59b056016e351429f719aaf02cc6fa4544a2d92d2a3d69beeeb56674b12a1f2
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9050220365c3d4317bfb9eee77e3abcc137e10ebf0d1b1a7e4370a6b88a28327
+oid sha256:a48d9034d2ce2771f0a840f8364e39645d6f213d3858f7f5342b741dc49975d2
 size 1064
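
All four binary files above are stored as Git LFS pointers, so each commit rewrites only the "oid sha256:" line; the version and size lines are unchanged (the checkpoint payloads changed content but not byte size). As a minimal sketch, assuming the repository is cloned with LFS smudging skipped (GIT_LFS_SKIP_MUDGE is not assumed; use GIT_LFS_SKIP_SMUDGE=1) so the pointer text is still on disk, the illustrative helpers below parse a pointer and check a separately downloaded object against it; they are not part of any repository tooling.

import hashlib
from pathlib import Path

def read_lfs_pointer(path):
    """Parse a Git LFS pointer file into (version, sha256 oid, size in bytes)."""
    lines = Path(path).read_text().strip().splitlines()
    fields = dict(line.split(" ", 1) for line in lines)
    oid = fields["oid"].split(":", 1)[1]  # drop the "sha256:" prefix
    return fields["version"], oid, int(fields["size"])

def verify_object(object_path, expected_oid, expected_size):
    """Check a downloaded LFS object against the oid/size recorded in its pointer."""
    digest = hashlib.sha256()
    total = 0
    with open(object_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            digest.update(chunk)
            total += len(chunk)
    return total == expected_size and digest.hexdigest() == expected_oid

version, oid, size = read_lfs_pointer("last-checkpoint/model.safetensors")
print(version, oid, size)

Against the model.safetensors pointer in this commit it would report the new oid (cce8730e...) and size 577789320.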
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
-  "best_metric": 0.31707289814949036,
-  "best_model_checkpoint": "mikhail_panzo/zlm_b128_le4_s8000/checkpoint-6500",
-  "epoch": 10.890052356020943,
+  "best_metric": 0.31611359119415283,
+  "best_model_checkpoint": "mikhail_panzo/zlm_b128_le4_s8000/checkpoint-7000",
+  "epoch": 11.727748691099476,
   "eval_steps": 500,
-  "global_step": 6500,
+  "global_step": 7000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1021,6 +1021,84 @@
       "eval_samples_per_second": 30.845,
       "eval_steps_per_second": 3.859,
       "step": 6500
+    },
+    {
+      "epoch": 10.973821989528796,
+      "grad_norm": 1.1960996389389038,
+      "learning_rate": 2.4200000000000002e-05,
+      "loss": 0.3478,
+      "step": 6550
+    },
+    {
+      "epoch": 11.057591623036648,
+      "grad_norm": 0.97001713514328,
+      "learning_rate": 2.3366666666666668e-05,
+      "loss": 0.3475,
+      "step": 6600
+    },
+    {
+      "epoch": 11.141361256544503,
+      "grad_norm": 1.1384519338607788,
+      "learning_rate": 2.2533333333333333e-05,
+      "loss": 0.3484,
+      "step": 6650
+    },
+    {
+      "epoch": 11.225130890052355,
+      "grad_norm": 0.9649496078491211,
+      "learning_rate": 2.1700000000000002e-05,
+      "loss": 0.3454,
+      "step": 6700
+    },
+    {
+      "epoch": 11.30890052356021,
+      "grad_norm": 1.0407809019088745,
+      "learning_rate": 2.0866666666666668e-05,
+      "loss": 0.3446,
+      "step": 6750
+    },
+    {
+      "epoch": 11.392670157068062,
+      "grad_norm": 1.087108850479126,
+      "learning_rate": 2.0033333333333334e-05,
+      "loss": 0.3475,
+      "step": 6800
+    },
+    {
+      "epoch": 11.476439790575917,
+      "grad_norm": 0.8870049715042114,
+      "learning_rate": 1.9200000000000003e-05,
+      "loss": 0.3454,
+      "step": 6850
+    },
+    {
+      "epoch": 11.56020942408377,
+      "grad_norm": 1.0377373695373535,
+      "learning_rate": 1.8366666666666668e-05,
+      "loss": 0.3447,
+      "step": 6900
+    },
+    {
+      "epoch": 11.643979057591624,
+      "grad_norm": 1.138604760169983,
+      "learning_rate": 1.7533333333333334e-05,
+      "loss": 0.345,
+      "step": 6950
+    },
+    {
+      "epoch": 11.727748691099476,
+      "grad_norm": 1.6464053392410278,
+      "learning_rate": 1.6700000000000003e-05,
+      "loss": 0.3465,
+      "step": 7000
+    },
+    {
+      "epoch": 11.727748691099476,
+      "eval_loss": 0.31611359119415283,
+      "eval_runtime": 271.5426,
+      "eval_samples_per_second": 31.262,
+      "eval_steps_per_second": 3.911,
+      "step": 7000
     }
   ],
   "logging_steps": 50,
@@ -1040,7 +1118,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.1645139306201984e+17,
+  "total_flos": 1.2541009593096864e+17,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null