ZeroUniqueness committed
Commit cbbaa1a · 1 Parent(s): a22bfcd

Training in progress, step 20000

adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bb683e1ef26fb6759ee6f8f26fd71fa321318d9618b1721b67182a9ba22c4bed
+oid sha256:be1e03ed4e6123418bf0d0eb5aa75d959570f0aeb7b2fc39e2bb25599324a44b
 size 500897101
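
Only the Git LFS pointer changes in this file: the oid is the SHA-256 of the new adapter weights, while the payload stays 500897101 bytes. A minimal sketch (the local download path is an assumption) for checking a fetched adapter_model.bin against the oid recorded in the pointer:

```python
# Sketch only: verify a downloaded adapter_model.bin against the sha256 oid
# in its Git LFS pointer. The local path is an assumption for illustration.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file in 1 MiB chunks so large weight files never fully load into RAM."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# New oid from the pointer diff above (step-20000 adapter).
EXPECTED = "be1e03ed4e6123418bf0d0eb5aa75d959570f0aeb7b2fc39e2bb25599324a44b"
print("match:", sha256_of("adapter_model.bin") == EXPECTED)
```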
{checkpoint-16000 β†’ checkpoint-19000/adapter_model}/README.md RENAMED
File without changes
{checkpoint-16000 β†’ checkpoint-19000/adapter_model}/adapter_config.json RENAMED
@@ -14,13 +14,13 @@
   "r": 32,
   "revision": null,
   "target_modules": [
-    "v_proj",
-    "o_proj",
-    "gate_proj",
-    "k_proj",
     "up_proj",
     "down_proj",
-    "q_proj"
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "o_proj"
   ],
   "task_type": "CAUSAL_LM"
 }
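
The only content change here is the ordering of target_modules: the same seven LLaMA-style projection modules appear before and after, which suggests the list is materialized from an unordered collection and so its serialization order can differ between saves. Below is a minimal sketch, not the author's exact training setup, of a peft LoraConfig that would serialize a target_modules list like this; lora_alpha and lora_dropout are assumptions, since only r, the module names, and task_type are visible in the hunk:

```python
# Sketch under assumptions: only r=32, the target module names, and
# task_type="CAUSAL_LM" come from the diff; alpha/dropout are guesses.
from peft import LoraConfig

lora_config = LoraConfig(
    r=32,
    lora_alpha=16,       # assumption: not shown in the visible hunk
    lora_dropout=0.05,   # assumption: not shown in the visible hunk
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
# Writes an adapter_config.json with these fields into the target directory.
lora_config.save_pretrained("checkpoint-20000/adapter_model")
```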
{checkpoint-16000 β†’ checkpoint-19000/adapter_model}/adapter_model.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bd4a02c01bc531e28def182521eea747ce903cb9327a80401dff2a70fc98f8af
+oid sha256:bb683e1ef26fb6759ee6f8f26fd71fa321318d9618b1721b67182a9ba22c4bed
 size 500897101
{checkpoint-16000/adapter_model β†’ checkpoint-20000}/README.md RENAMED
File without changes
{checkpoint-16000/adapter_model β†’ checkpoint-20000}/adapter_config.json RENAMED
@@ -14,13 +14,13 @@
   "r": 32,
   "revision": null,
   "target_modules": [
-    "v_proj",
-    "o_proj",
-    "gate_proj",
-    "k_proj",
     "up_proj",
     "down_proj",
-    "q_proj"
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "o_proj"
   ],
   "task_type": "CAUSAL_LM"
 }
{checkpoint-16000/adapter_model β†’ checkpoint-20000}/adapter_model.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bd4a02c01bc531e28def182521eea747ce903cb9327a80401dff2a70fc98f8af
+oid sha256:be1e03ed4e6123418bf0d0eb5aa75d959570f0aeb7b2fc39e2bb25599324a44b
 size 500897101
{checkpoint-16000 β†’ checkpoint-20000}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8cfd954cd2c868cd11f0d8a538fd821ecb3761fb1f4987a97f38c9ef7ea6a56f
+oid sha256:672e6e99ca3330c95b0226b42d37365aee4e2e1fd37665dc8a1baea4285d85ce
 size 1001723453
{checkpoint-16000 β†’ checkpoint-20000}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0bcbc2923809b7d9828ee55561a29a28376a01d3ed10e710887a6f6606453114
+oid sha256:9379dd24f4bc7f42c02a2395c82177694b882887f5289666a7fc40b3c707700a
 size 14575
{checkpoint-16000 β†’ checkpoint-20000}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c9d85d56cfe7deb3b0de257e3941e7b8f81cd78a2ac529b4bfef3c562e9e1d3e
+oid sha256:6540ccb807cc963275480e497de74768a480fbf46bd84a7c5eeaa24e5c7b2503
 size 627
{checkpoint-16000 β†’ checkpoint-20000}/trainer_state.json RENAMED
@@ -1,8 +1,8 @@
 {
-  "best_metric": 0.6547831892967224,
-  "best_model_checkpoint": "./qlora-out/checkpoint-16000",
-  "epoch": 0.5965474814511017,
-  "global_step": 16000,
+  "best_metric": 0.6324757933616638,
+  "best_model_checkpoint": "./qlora-out/checkpoint-20000",
+  "epoch": 0.7456843518138772,
+  "global_step": 20000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1094,11 +1094,283 @@
       "eval_samples_per_second": 0.415,
       "eval_steps_per_second": 0.415,
       "step": 16000
+    },
+    {
+      "epoch": 0.6,
+      "learning_rate": 0.00018090307655361701,
+      "loss": 0.6354,
+      "step": 16100
+    },
+    {
+      "epoch": 0.6,
+      "learning_rate": 0.00018067294467637228,
+      "loss": 0.6349,
+      "step": 16200
+    },
+    {
+      "epoch": 0.61,
+      "learning_rate": 0.00018044158269290054,
+      "loss": 0.6127,
+      "step": 16300
+    },
+    {
+      "epoch": 0.61,
+      "learning_rate": 0.00018020899413102412,
+      "loss": 0.5977,
+      "step": 16400
+    },
+    {
+      "epoch": 0.62,
+      "learning_rate": 0.00017997518253726834,
+      "loss": 0.6213,
+      "step": 16500
+    },
+    {
+      "epoch": 0.62,
+      "learning_rate": 0.00017974015147680734,
+      "loss": 0.6168,
+      "step": 16600
+    },
+    {
+      "epoch": 0.62,
+      "learning_rate": 0.00017950390453340978,
+      "loss": 0.5978,
+      "step": 16700
+    },
+    {
+      "epoch": 0.63,
+      "learning_rate": 0.0001792664453093842,
+      "loss": 0.6201,
+      "step": 16800
+    },
+    {
+      "epoch": 0.63,
+      "learning_rate": 0.000179027777425524,
+      "loss": 0.6141,
+      "step": 16900
+    },
+    {
+      "epoch": 0.63,
+      "learning_rate": 0.00017878790452105245,
+      "loss": 0.6135,
+      "step": 17000
+    },
+    {
+      "epoch": 0.63,
+      "eval_loss": 0.6480616927146912,
+      "eval_runtime": 1347.9883,
+      "eval_samples_per_second": 0.402,
+      "eval_steps_per_second": 0.402,
+      "step": 17000
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 0.0001785468302535669,
+      "loss": 0.6363,
+      "step": 17100
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 0.00017830455829898317,
+      "loss": 0.6076,
+      "step": 17200
+    },
+    {
+      "epoch": 0.65,
+      "learning_rate": 0.00017806109235147963,
+      "loss": 0.609,
+      "step": 17300
+    },
+    {
+      "epoch": 0.65,
+      "learning_rate": 0.00017781643612344058,
+      "loss": 0.6044,
+      "step": 17400
+    },
+    {
+      "epoch": 0.65,
+      "learning_rate": 0.00017757059334539994,
+      "loss": 0.6262,
+      "step": 17500
+    },
+    {
+      "epoch": 0.66,
+      "learning_rate": 0.00017732356776598403,
+      "loss": 0.6195,
+      "step": 17600
+    },
+    {
+      "epoch": 0.66,
+      "learning_rate": 0.0001770753631518548,
+      "loss": 0.6328,
+      "step": 17700
+    },
+    {
+      "epoch": 0.66,
+      "learning_rate": 0.000176825983287652,
+      "loss": 0.6028,
+      "step": 17800
+    },
+    {
+      "epoch": 0.67,
+      "learning_rate": 0.0001765754319759358,
+      "loss": 0.6159,
+      "step": 17900
+    },
+    {
+      "epoch": 0.67,
+      "learning_rate": 0.0001763237130371287,
+      "loss": 0.6169,
+      "step": 18000
+    },
+    {
+      "epoch": 0.67,
+      "eval_loss": 0.6444052457809448,
+      "eval_runtime": 1304.3701,
+      "eval_samples_per_second": 0.416,
+      "eval_steps_per_second": 0.416,
+      "step": 18000
+    },
+    {
+      "epoch": 0.67,
+      "learning_rate": 0.0001760708303094572,
+      "loss": 0.6183,
+      "step": 18100
+    },
+    {
+      "epoch": 0.68,
+      "learning_rate": 0.00017581678764889324,
+      "loss": 0.6116,
+      "step": 18200
+    },
+    {
+      "epoch": 0.68,
+      "learning_rate": 0.00017556158892909567,
+      "loss": 0.6406,
+      "step": 18300
+    },
+    {
+      "epoch": 0.69,
+      "learning_rate": 0.00017530523804135085,
+      "loss": 0.6223,
+      "step": 18400
+    },
+    {
+      "epoch": 0.69,
+      "learning_rate": 0.00017504773889451361,
+      "loss": 0.628,
+      "step": 18500
+    },
+    {
+      "epoch": 0.69,
+      "learning_rate": 0.00017478909541494736,
+      "loss": 0.6173,
+      "step": 18600
+    },
+    {
+      "epoch": 0.7,
+      "learning_rate": 0.00017452931154646444,
+      "loss": 0.61,
+      "step": 18700
+    },
+    {
+      "epoch": 0.7,
+      "learning_rate": 0.00017426839125026598,
+      "loss": 0.5959,
+      "step": 18800
+    },
+    {
+      "epoch": 0.7,
+      "learning_rate": 0.00017400633850488128,
+      "loss": 0.5979,
+      "step": 18900
+    },
+    {
+      "epoch": 0.71,
+      "learning_rate": 0.00017374315730610745,
+      "loss": 0.6161,
+      "step": 19000
+    },
+    {
+      "epoch": 0.71,
+      "eval_loss": 0.6378119587898254,
+      "eval_runtime": 1283.5987,
+      "eval_samples_per_second": 0.422,
+      "eval_steps_per_second": 0.422,
+      "step": 19000
+    },
+    {
+      "epoch": 0.71,
+      "learning_rate": 0.00017347885166694825,
+      "loss": 0.6213,
+      "step": 19100
+    },
+    {
+      "epoch": 0.72,
+      "learning_rate": 0.00017321342561755297,
+      "loss": 0.6217,
+      "step": 19200
+    },
+    {
+      "epoch": 0.72,
+      "learning_rate": 0.00017294688320515506,
+      "loss": 0.6127,
+      "step": 19300
+    },
+    {
+      "epoch": 0.72,
+      "learning_rate": 0.00017267922849401024,
+      "loss": 0.6145,
+      "step": 19400
+    },
+    {
+      "epoch": 0.73,
+      "learning_rate": 0.00017241046556533472,
+      "loss": 0.5936,
+      "step": 19500
+    },
+    {
+      "epoch": 0.73,
+      "learning_rate": 0.0001721405985172428,
+      "loss": 0.6273,
+      "step": 19600
+    },
+    {
+      "epoch": 0.73,
+      "learning_rate": 0.0001718696314646846,
+      "loss": 0.6059,
+      "step": 19700
+    },
+    {
+      "epoch": 0.74,
+      "learning_rate": 0.000171597568539383,
+      "loss": 0.5934,
+      "step": 19800
+    },
+    {
+      "epoch": 0.74,
+      "learning_rate": 0.000171324413889771,
+      "loss": 0.6243,
+      "step": 19900
+    },
+    {
+      "epoch": 0.75,
+      "learning_rate": 0.00017105017168092808,
+      "loss": 0.6164,
+      "step": 20000
+    },
+    {
+      "epoch": 0.75,
+      "eval_loss": 0.6324757933616638,
+      "eval_runtime": 1266.6769,
+      "eval_samples_per_second": 0.428,
+      "eval_steps_per_second": 0.428,
+      "step": 20000
     }
   ],
   "max_steps": 80463,
   "num_train_epochs": 3,
-  "total_flos": 4.4837216723501875e+18,
+  "total_flos": 5.606446667012506e+18,
   "trial_name": null,
   "trial_params": null
 }
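
trainer_state.json gains the train-loss and eval records for steps 16100 through 20000; eval_loss falls from 0.6548 at step 16000 to 0.6325 at step 20000, which is why best_model_checkpoint advances to ./qlora-out/checkpoint-20000. A minimal sketch (the checkpoint path is an assumption) for pulling the eval curve and best-checkpoint info out of this file:

```python
# Sketch only: read a checkpoint's trainer_state.json and summarize progress.
# The path is an assumption for illustration.
import json

with open("checkpoint-20000/trainer_state.json") as f:
    state = json.load(f)

print("best checkpoint:", state["best_model_checkpoint"], state["best_metric"])

# Records containing eval_loss are the periodic evaluations; the rest are train logs.
evals = [(rec["step"], rec["eval_loss"])
         for rec in state["log_history"] if "eval_loss" in rec]
for step, loss in evals[-5:]:
    print(f"step {step}: eval_loss {loss:.4f}")
```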
{checkpoint-16000 β†’ checkpoint-20000}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a6f52f1dc0e9f02b39b53daa1c87bbc62976c717096fc5d03aab7e139a51a837
+oid sha256:f8ca8c55b410908f1a6fb4d78d55fe6aad82bbca76ec8021e18981496f18fa70
 size 4027
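
training_args.bin is the pickled TrainingArguments object for the run. A minimal sketch for inspecting it (the path is an assumption; unpickling requires transformers to be installed, since it reconstructs a TrainingArguments instance):

```python
# Sketch only: training_args.bin is a pickled TrainingArguments, so loading it
# needs transformers installed and, on newer torch, weights_only=False.
import torch

args = torch.load("checkpoint-20000/training_args.bin", weights_only=False)
print(type(args).__name__)      # expected: TrainingArguments (or a subclass)
print(args.learning_rate, args.num_train_epochs)
```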