kiddothe2b committed
Commit 0fed369
1 Parent(s): 5d8ab3b

Training in progress, step 25600

last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6d041fc9e6d55c4a7915f0599d0972686813610cfd5a2d83bd76580f7087c5ca
+oid sha256:a82d4998e15048f7276ff7bf21cf172b2b8f99b8e3bce01b447dd4dc2e0f4219
 size 745634697
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a7a084a9b3d69be038d1f70310127204c769b6f31132335d3c43f2359a442b86
+oid sha256:85efd468f59e090bda67c9e694bf55407f51a1a6d9bede51d725c6b288ff9330
 size 372832803
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ac85050f1de5a3da93b15d68ec19d08f9c128973d47940d52332ce7a8a430098
+oid sha256:5885efec8b7366a4aa17af5e032d3298449da4e1fd163c7c8437f60c984450c3
 size 15523
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:12a6154fa53f0286557ec7a9b6bf6b9f5b2fb01f4345510fa7b96c5e44005857
+oid sha256:ac4294ae0275bdf2fd072eb3d13fea356c3c27e1570dc0dcf8759f2decf14230
 size 623
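The four `last-checkpoint/` files above are Git LFS pointer files: each records the LFS spec version, the SHA-256 object id of the tracked payload, and its byte size. Only the `oid` changes between steps 19200 and 25600; the sizes stay fixed because the tensors keep the same shapes. After downloading the payloads, a pointer can be checked against the file on disk. This is a minimal sketch: the three-line pointer format is standard Git LFS, but the paths in the example are placeholders, not files tracked in this repository.

```python
# Minimal sketch: verify a downloaded checkpoint payload against its Git LFS pointer.
# The pointer format (version / oid sha256:<hex> / size <bytes>) is standard Git LFS;
# the paths passed to verify_payload below are illustrative placeholders.
import hashlib
from pathlib import Path


def parse_lfs_pointer(pointer_path: str) -> dict:
    """Read the three-line LFS pointer into a dict of its fields."""
    fields = {}
    for line in Path(pointer_path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields


def verify_payload(pointer_path: str, payload_path: str) -> bool:
    """Check that the payload's byte size and SHA-256 digest match the pointer."""
    pointer = parse_lfs_pointer(pointer_path)
    expected_oid = pointer["oid"].split(":", 1)[1]
    expected_size = int(pointer["size"])
    data = Path(payload_path).read_bytes()
    return len(data) == expected_size and hashlib.sha256(data).hexdigest() == expected_oid


# Example usage (paths are illustrative):
# verify_payload("optimizer.pt.pointer", "last-checkpoint/optimizer.pt")
```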
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.3,
-  "global_step": 19200,
+  "epoch": 0.4,
+  "global_step": 25600,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1185,11 +1185,404 @@
       "eval_samples_per_second": 45.96,
       "eval_steps_per_second": 2.873,
       "step": 19200
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 0.001,
+      "loss": 8.043,
+      "step": 19300
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 0.001,
+      "loss": 8.0459,
+      "step": 19400
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 0.001,
+      "loss": 8.0138,
+      "step": 19500
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 0.001,
+      "loss": 8.0321,
+      "step": 19600
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 0.001,
+      "loss": 8.0512,
+      "step": 19700
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 0.001,
+      "loss": 8.0284,
+      "step": 19800
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 0.001,
+      "loss": 8.0347,
+      "step": 19900
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 0.001,
+      "loss": 8.046,
+      "step": 20000
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 0.001,
+      "loss": 8.0466,
+      "step": 20100
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 0.001,
+      "loss": 8.0612,
+      "step": 20200
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 0.001,
+      "loss": 8.0392,
+      "step": 20300
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 0.001,
+      "loss": 8.0422,
+      "step": 20400
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 0.001,
+      "loss": 8.0386,
+      "step": 20500
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 0.001,
+      "loss": 8.0504,
+      "step": 20600
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 0.001,
+      "loss": 8.0483,
+      "step": 20700
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 0.001,
+      "loss": 8.0463,
+      "step": 20800
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 0.001,
+      "loss": 8.0545,
+      "step": 20900
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 0.001,
+      "loss": 8.0491,
+      "step": 21000
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 0.001,
+      "loss": 8.0336,
+      "step": 21100
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 0.001,
+      "loss": 8.0372,
+      "step": 21200
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 0.001,
+      "loss": 8.0405,
+      "step": 21300
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 0.001,
+      "loss": 8.028,
+      "step": 21400
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 0.001,
+      "loss": 8.0234,
+      "step": 21500
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 0.001,
+      "loss": 8.0256,
+      "step": 21600
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 0.001,
+      "loss": 8.028,
+      "step": 21700
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 0.001,
+      "loss": 8.0338,
+      "step": 21800
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 0.001,
+      "loss": 8.0345,
+      "step": 21900
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 0.001,
+      "loss": 8.0353,
+      "step": 22000
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 0.001,
+      "loss": 8.0426,
+      "step": 22100
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 0.001,
+      "loss": 8.0318,
+      "step": 22200
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 0.001,
+      "loss": 8.0341,
+      "step": 22300
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 0.001,
+      "loss": 8.026,
+      "step": 22400
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 0.001,
+      "loss": 8.0333,
+      "step": 22500
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 0.001,
+      "loss": 8.0265,
+      "step": 22600
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 0.001,
+      "loss": 8.0764,
+      "step": 22700
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 0.001,
+      "loss": 8.0355,
+      "step": 22800
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 0.001,
+      "loss": 8.0472,
+      "step": 22900
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 0.001,
+      "loss": 8.0376,
+      "step": 23000
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 0.001,
+      "loss": 8.0335,
+      "step": 23100
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 0.001,
+      "loss": 8.0429,
+      "step": 23200
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 0.001,
+      "loss": 8.0431,
+      "step": 23300
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 0.001,
+      "loss": 8.0259,
+      "step": 23400
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 0.001,
+      "loss": 8.0339,
+      "step": 23500
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 0.001,
+      "loss": 8.0192,
+      "step": 23600
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 0.001,
+      "loss": 8.0515,
+      "step": 23700
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 0.001,
+      "loss": 8.0258,
+      "step": 23800
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 0.001,
+      "loss": 8.0422,
+      "step": 23900
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 0.001,
+      "loss": 8.0326,
+      "step": 24000
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 0.001,
+      "loss": 8.0392,
+      "step": 24100
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 0.001,
+      "loss": 8.0414,
+      "step": 24200
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 0.001,
+      "loss": 8.0505,
+      "step": 24300
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 0.001,
+      "loss": 8.0267,
+      "step": 24400
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 0.001,
+      "loss": 8.0391,
+      "step": 24500
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 0.001,
+      "loss": 8.0381,
+      "step": 24600
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.001,
+      "loss": 8.0339,
+      "step": 24700
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.001,
+      "loss": 8.0448,
+      "step": 24800
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.001,
+      "loss": 8.0633,
+      "step": 24900
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.001,
+      "loss": 8.0064,
+      "step": 25000
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.001,
+      "loss": 8.0323,
+      "step": 25100
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.001,
+      "loss": 8.0286,
+      "step": 25200
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.001,
+      "loss": 8.0349,
+      "step": 25300
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.001,
+      "loss": 8.0373,
+      "step": 25400
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.001,
+      "loss": 8.0447,
+      "step": 25500
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.001,
+      "loss": 8.0302,
+      "step": 25600
+    },
+    {
+      "epoch": 0.4,
+      "eval_accuracy": 0.03333073104167396,
+      "eval_loss": 8.036836624145508,
+      "eval_runtime": 6892.0068,
+      "eval_samples_per_second": 47.571,
+      "eval_steps_per_second": 2.973,
+      "step": 25600
     }
   ],
   "max_steps": 64000,
   "num_train_epochs": 9223372036854775807,
-  "total_flos": 1.015475293126656e+17,
+  "total_flos": 1.353967057502208e+17,
   "trial_name": null,
   "trial_params": null
 }
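The trainer_state.json update appends one log record per 100 optimizer steps (steps 19300 through 25600, all at a flat learning rate of 0.001 with the loss hovering around 8.03) plus an evaluation record at step 25600, and advances epoch, global_step, and total_flos to match. The epoch value 0.4 is consistent with 25600 of the 64000 max_steps, i.e. roughly 64000 steps per pass over the data. Below is a minimal sketch for summarizing such a file, assuming the standard Hugging Face Trainer layout in which these records live under a `log_history` list; the checkpoint path is illustrative.

```python
# Minimal sketch: summarize a trainer_state.json like the one updated above.
# Assumes the standard Hugging Face Trainer layout with a "log_history" list;
# the path below is illustrative.
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

# Training records carry "loss"; evaluation records carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"progress: step {state['global_step']}/{state['max_steps']} "
      f"(epoch {state['epoch']})")
print(f"latest training loss: {train_logs[-1]['loss']} at step {train_logs[-1]['step']}")
if eval_logs:
    print(f"latest eval loss: {eval_logs[-1]['eval_loss']} "
          f"(accuracy {eval_logs[-1].get('eval_accuracy')})")
```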
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a7a084a9b3d69be038d1f70310127204c769b6f31132335d3c43f2359a442b86
+oid sha256:85efd468f59e090bda67c9e694bf55407f51a1a6d9bede51d725c6b288ff9330
 size 372832803
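Both the top-level pytorch_model.bin and the copy under last-checkpoint/ advance to the step-25600 weights (identical oid and size); at 372,832,803 bytes the payload is consistent with roughly 93 million float32 parameters, ignoring serialization overhead. A minimal sketch for inspecting the updated weights locally, assuming the LFS payload has been pulled into a clone of this repository:

```python
# Minimal sketch: inspect the updated pytorch_model.bin checkpoint.
# Assumes `git lfs pull` (or an equivalent download) has replaced the pointer
# with the actual 372,832,803-byte payload; the path is relative to the clone.
import torch

state_dict = torch.load("pytorch_model.bin", map_location="cpu")
n_params = sum(t.numel() for t in state_dict.values())
print(f"{len(state_dict)} tensors, {n_params / 1e6:.1f}M parameters")
```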