mikhail-panzo committed
Commit 5b6e060
1 Parent(s): bbe8e25

Training in progress, step 7000, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cd6982ff60e76681cdea96533f303047c7f9b43cd64cfdb19cf0a9f1d94ff80e
+oid sha256:b33eb734ff3c6509d7cf8201446914cc165199f2b27d28f8526dd06f36c4c3a3
 size 577789320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1316bfa6dd7cc4e4916a4fe92782056a65d891c7aa55ec7fda87d7140ddb5690
+oid sha256:1be8b22777ae310af2ce894c83ae3165caa437666efbe62ef81d813a59a34364
 size 1155772233
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fc4bf9efeacd197a3755be03d6b9b32a091466296d5d750906cf5632d21aec65
+oid sha256:a59b056016e351429f719aaf02cc6fa4544a2d92d2a3d69beeeb56674b12a1f2
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:935e716e26427b50d58301a89c5ac51882cf7f1f1087c9e92a9aedc9583a88dd
+oid sha256:79faef7859961c2a638e4312a796703fd0c86e5877b740a2ce8b47db225af025
 size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
-  "best_metric": 0.36852771043777466,
-  "best_model_checkpoint": "mikhail-panzo/zlm_b128_le5_s8000/checkpoint-6500",
-  "epoch": 10.890052356020943,
+  "best_metric": 0.3675082325935364,
+  "best_model_checkpoint": "mikhail-panzo/zlm_b128_le5_s8000/checkpoint-7000",
+  "epoch": 11.727748691099476,
   "eval_steps": 500,
-  "global_step": 6500,
+  "global_step": 7000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1021,6 +1021,84 @@
       "eval_samples_per_second": 33.265,
       "eval_steps_per_second": 4.162,
       "step": 6500
+    },
+    {
+      "epoch": 10.973821989528796,
+      "grad_norm": 1.0459740161895752,
+      "learning_rate": 2.421666666666667e-06,
+      "loss": 0.4074,
+      "step": 6550
+    },
+    {
+      "epoch": 11.057591623036648,
+      "grad_norm": 0.9029247760772705,
+      "learning_rate": 2.3383333333333335e-06,
+      "loss": 0.4075,
+      "step": 6600
+    },
+    {
+      "epoch": 11.141361256544503,
+      "grad_norm": 1.5372889041900635,
+      "learning_rate": 2.2550000000000004e-06,
+      "loss": 0.4088,
+      "step": 6650
+    },
+    {
+      "epoch": 11.225130890052355,
+      "grad_norm": 0.9959379434585571,
+      "learning_rate": 2.171666666666667e-06,
+      "loss": 0.4044,
+      "step": 6700
+    },
+    {
+      "epoch": 11.30890052356021,
+      "grad_norm": 1.3793728351593018,
+      "learning_rate": 2.088333333333334e-06,
+      "loss": 0.4034,
+      "step": 6750
+    },
+    {
+      "epoch": 11.392670157068062,
+      "grad_norm": 1.2086491584777832,
+      "learning_rate": 2.0050000000000003e-06,
+      "loss": 0.4073,
+      "step": 6800
+    },
+    {
+      "epoch": 11.476439790575917,
+      "grad_norm": 1.07647705078125,
+      "learning_rate": 1.9216666666666668e-06,
+      "loss": 0.405,
+      "step": 6850
+    },
+    {
+      "epoch": 11.56020942408377,
+      "grad_norm": 0.9849846363067627,
+      "learning_rate": 1.8383333333333334e-06,
+      "loss": 0.4037,
+      "step": 6900
+    },
+    {
+      "epoch": 11.643979057591624,
+      "grad_norm": 1.2623456716537476,
+      "learning_rate": 1.7550000000000001e-06,
+      "loss": 0.4042,
+      "step": 6950
+    },
+    {
+      "epoch": 11.727748691099476,
+      "grad_norm": 0.9488279819488525,
+      "learning_rate": 1.6716666666666666e-06,
+      "loss": 0.4069,
+      "step": 7000
+    },
+    {
+      "epoch": 11.727748691099476,
+      "eval_loss": 0.3675082325935364,
+      "eval_runtime": 257.7947,
+      "eval_samples_per_second": 32.929,
+      "eval_steps_per_second": 4.12,
+      "step": 7000
     }
   ],
   "logging_steps": 50,
@@ -1040,7 +1118,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.1645139306201984e+17,
+  "total_flos": 1.2541009593096864e+17,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null
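For context (not part of the commit itself): trainer_state.json is plain JSON written by the transformers Trainer at each save, so the fields updated in this diff can be read directly from a local clone. A minimal sketch, assuming the repo has been cloned with git-lfs so the pointer files above resolve to real weights; the local path is illustrative.

```python
import json
from pathlib import Path

# Illustrative path to a local clone of mikhail-panzo/zlm_b128_le5_s8000 (assumption).
ckpt_dir = Path("zlm_b128_le5_s8000/last-checkpoint")

# Read the trainer state saved alongside the weights at this checkpoint.
state = json.loads((ckpt_dir / "trainer_state.json").read_text())

print(state["global_step"])             # 7000 after this commit
print(state["best_metric"])             # 0.3675... (best eval loss so far)
print(state["best_model_checkpoint"])   # .../checkpoint-7000

# The last log_history entry is the evaluation record logged at step 7000.
print(state["log_history"][-1])
```

Resuming the run from this state is then a matter of passing the same directory to the Trainer, e.g. `trainer.train(resume_from_checkpoint=str(ckpt_dir))`, which also restores optimizer.pt, scheduler.pt, and rng_state.pth shown above.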