CreatorPhan committed on
Commit
3011d63
1 Parent(s): 5006939

Upload folder using huggingface_hub (#3)

Browse files

- Upload folder using huggingface_hub (4f400093aef44295e47459519fc4d9c41c29a909)

Files changed (5) hide show
  1. adapter_model.bin +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +3 -1203
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b7c71f5a48f92e6510bcb8484209f58b75e91833c9a1c22c3aac3c29d200327
3
  size 39409357
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8beb03e0dedfb5b2da0d68ebcc49dbb502cea67ba9c0d70bed474bdde253aa1d
3
  size 39409357
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3319acdf659478ae0888e0b42c35dd789fcd6f3cf90e0350a5faa31dda4e7fbb
3
  size 78844421
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:745db72555229d45c6a69054bcdee18c9e9f3193c81ae335546612e8aaa6d7c4
3
  size 78844421
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:370c3a07f37a8aae6ea141b54ca992b21699546baf7407eb587b6056f787333b
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6162bb9db25c89c41e126a7a00a5d0695219447bff9b18d08731531620758440
3
  size 14575
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f71b8f28a925f4e713f8094eb164fc93a2eae54f8641cb0bc6cba1f8b907c73
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:461382c4e71db35cef694681fd3a0c229adb91f6f9d1e458e9d3c9be9149f8c4
3
  size 627
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 11.428571428571429,
5
  "eval_steps": 500,
6
- "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1207,1213 +1207,13 @@
1207
  "learning_rate": 0.002642857142857143,
1208
  "loss": 1.678,
1209
  "step": 200
1210
- },
1211
- {
1212
- "epoch": 5.74,
1213
- "learning_rate": 0.0026410714285714284,
1214
- "loss": 1.7613,
1215
- "step": 201
1216
- },
1217
- {
1218
- "epoch": 5.77,
1219
- "learning_rate": 0.0026392857142857142,
1220
- "loss": 1.7541,
1221
- "step": 202
1222
- },
1223
- {
1224
- "epoch": 5.8,
1225
- "learning_rate": 0.0026375,
1226
- "loss": 1.798,
1227
- "step": 203
1228
- },
1229
- {
1230
- "epoch": 5.83,
1231
- "learning_rate": 0.002635714285714286,
1232
- "loss": 1.821,
1233
- "step": 204
1234
- },
1235
- {
1236
- "epoch": 5.86,
1237
- "learning_rate": 0.0026339285714285714,
1238
- "loss": 1.8385,
1239
- "step": 205
1240
- },
1241
- {
1242
- "epoch": 5.89,
1243
- "learning_rate": 0.002632142857142857,
1244
- "loss": 1.8613,
1245
- "step": 206
1246
- },
1247
- {
1248
- "epoch": 5.91,
1249
- "learning_rate": 0.002630357142857143,
1250
- "loss": 1.902,
1251
- "step": 207
1252
- },
1253
- {
1254
- "epoch": 5.94,
1255
- "learning_rate": 0.0026285714285714285,
1256
- "loss": 2.0848,
1257
- "step": 208
1258
- },
1259
- {
1260
- "epoch": 5.97,
1261
- "learning_rate": 0.0026267857142857143,
1262
- "loss": 2.3277,
1263
- "step": 209
1264
- },
1265
- {
1266
- "epoch": 6.0,
1267
- "learning_rate": 0.002625,
1268
- "loss": 2.8535,
1269
- "step": 210
1270
- },
1271
- {
1272
- "epoch": 6.03,
1273
- "learning_rate": 0.0026232142857142856,
1274
- "loss": 6.2197,
1275
- "step": 211
1276
- },
1277
- {
1278
- "epoch": 6.06,
1279
- "learning_rate": 0.0026214285714285714,
1280
- "loss": 10.2288,
1281
- "step": 212
1282
- },
1283
- {
1284
- "epoch": 6.09,
1285
- "learning_rate": 0.0026196428571428573,
1286
- "loss": 12.5006,
1287
- "step": 213
1288
- },
1289
- {
1290
- "epoch": 6.11,
1291
- "learning_rate": 0.002617857142857143,
1292
- "loss": 10.5184,
1293
- "step": 214
1294
- },
1295
- {
1296
- "epoch": 6.14,
1297
- "learning_rate": 0.0026160714285714285,
1298
- "loss": 9.4834,
1299
- "step": 215
1300
- },
1301
- {
1302
- "epoch": 6.17,
1303
- "learning_rate": 0.0026142857142857144,
1304
- "loss": 16.0513,
1305
- "step": 216
1306
- },
1307
- {
1308
- "epoch": 6.2,
1309
- "learning_rate": 0.0026125000000000002,
1310
- "loss": 11.0576,
1311
- "step": 217
1312
- },
1313
- {
1314
- "epoch": 6.23,
1315
- "learning_rate": 0.0026107142857142857,
1316
- "loss": 15.3574,
1317
- "step": 218
1318
- },
1319
- {
1320
- "epoch": 6.26,
1321
- "learning_rate": 0.0026089285714285715,
1322
- "loss": 15.5239,
1323
- "step": 219
1324
- },
1325
- {
1326
- "epoch": 6.29,
1327
- "learning_rate": 0.0026071428571428574,
1328
- "loss": 15.3973,
1329
- "step": 220
1330
- },
1331
- {
1332
- "epoch": 6.31,
1333
- "learning_rate": 0.0026053571428571428,
1334
- "loss": 12.059,
1335
- "step": 221
1336
- },
1337
- {
1338
- "epoch": 6.34,
1339
- "learning_rate": 0.0026035714285714286,
1340
- "loss": 10.8352,
1341
- "step": 222
1342
- },
1343
- {
1344
- "epoch": 6.37,
1345
- "learning_rate": 0.0026017857142857145,
1346
- "loss": 10.1507,
1347
- "step": 223
1348
- },
1349
- {
1350
- "epoch": 6.4,
1351
- "learning_rate": 0.0026000000000000003,
1352
- "loss": 10.651,
1353
- "step": 224
1354
- },
1355
- {
1356
- "epoch": 6.43,
1357
- "learning_rate": 0.0025982142857142857,
1358
- "loss": 9.8363,
1359
- "step": 225
1360
- },
1361
- {
1362
- "epoch": 6.46,
1363
- "learning_rate": 0.0025964285714285716,
1364
- "loss": 9.3673,
1365
- "step": 226
1366
- },
1367
- {
1368
- "epoch": 6.49,
1369
- "learning_rate": 0.0025946428571428574,
1370
- "loss": 9.5433,
1371
- "step": 227
1372
- },
1373
- {
1374
- "epoch": 6.51,
1375
- "learning_rate": 0.002592857142857143,
1376
- "loss": 9.9206,
1377
- "step": 228
1378
- },
1379
- {
1380
- "epoch": 6.54,
1381
- "learning_rate": 0.0025910714285714287,
1382
- "loss": 9.5516,
1383
- "step": 229
1384
- },
1385
- {
1386
- "epoch": 6.57,
1387
- "learning_rate": 0.0025892857142857145,
1388
- "loss": 9.2165,
1389
- "step": 230
1390
- },
1391
- {
1392
- "epoch": 6.6,
1393
- "learning_rate": 0.0025875000000000004,
1394
- "loss": 9.0825,
1395
- "step": 231
1396
- },
1397
- {
1398
- "epoch": 6.63,
1399
- "learning_rate": 0.002585714285714286,
1400
- "loss": 8.7437,
1401
- "step": 232
1402
- },
1403
- {
1404
- "epoch": 6.66,
1405
- "learning_rate": 0.0025839285714285717,
1406
- "loss": 8.6366,
1407
- "step": 233
1408
- },
1409
- {
1410
- "epoch": 6.69,
1411
- "learning_rate": 0.0025821428571428575,
1412
- "loss": 9.7431,
1413
- "step": 234
1414
- },
1415
- {
1416
- "epoch": 6.71,
1417
- "learning_rate": 0.002580357142857143,
1418
- "loss": 8.1876,
1419
- "step": 235
1420
- },
1421
- {
1422
- "epoch": 6.74,
1423
- "learning_rate": 0.0025785714285714288,
1424
- "loss": 8.4559,
1425
- "step": 236
1426
- },
1427
- {
1428
- "epoch": 6.77,
1429
- "learning_rate": 0.002576785714285714,
1430
- "loss": 8.0092,
1431
- "step": 237
1432
- },
1433
- {
1434
- "epoch": 6.8,
1435
- "learning_rate": 0.002575,
1436
- "loss": 8.028,
1437
- "step": 238
1438
- },
1439
- {
1440
- "epoch": 6.83,
1441
- "learning_rate": 0.0025732142857142854,
1442
- "loss": 7.8379,
1443
- "step": 239
1444
- },
1445
- {
1446
- "epoch": 6.86,
1447
- "learning_rate": 0.0025714285714285713,
1448
- "loss": 7.8127,
1449
- "step": 240
1450
- },
1451
- {
1452
- "epoch": 6.89,
1453
- "learning_rate": 0.002569642857142857,
1454
- "loss": 7.8252,
1455
- "step": 241
1456
- },
1457
- {
1458
- "epoch": 6.91,
1459
- "learning_rate": 0.002567857142857143,
1460
- "loss": 7.7094,
1461
- "step": 242
1462
- },
1463
- {
1464
- "epoch": 6.94,
1465
- "learning_rate": 0.0025660714285714284,
1466
- "loss": 7.7962,
1467
- "step": 243
1468
- },
1469
- {
1470
- "epoch": 6.97,
1471
- "learning_rate": 0.0025642857142857143,
1472
- "loss": 7.4966,
1473
- "step": 244
1474
- },
1475
- {
1476
- "epoch": 7.0,
1477
- "learning_rate": 0.0025625,
1478
- "loss": 7.4851,
1479
- "step": 245
1480
- },
1481
- {
1482
- "epoch": 7.03,
1483
- "learning_rate": 0.0025607142857142855,
1484
- "loss": 7.5188,
1485
- "step": 246
1486
- },
1487
- {
1488
- "epoch": 7.06,
1489
- "learning_rate": 0.0025589285714285714,
1490
- "loss": 7.7866,
1491
- "step": 247
1492
- },
1493
- {
1494
- "epoch": 7.09,
1495
- "learning_rate": 0.0025571428571428572,
1496
- "loss": 7.5743,
1497
- "step": 248
1498
- },
1499
- {
1500
- "epoch": 7.11,
1501
- "learning_rate": 0.0025553571428571426,
1502
- "loss": 7.4608,
1503
- "step": 249
1504
- },
1505
- {
1506
- "epoch": 7.14,
1507
- "learning_rate": 0.0025535714285714285,
1508
- "loss": 7.4655,
1509
- "step": 250
1510
- },
1511
- {
1512
- "epoch": 7.17,
1513
- "learning_rate": 0.0025517857142857143,
1514
- "loss": 7.5474,
1515
- "step": 251
1516
- },
1517
- {
1518
- "epoch": 7.2,
1519
- "learning_rate": 0.00255,
1520
- "loss": 7.6983,
1521
- "step": 252
1522
- },
1523
- {
1524
- "epoch": 7.23,
1525
- "learning_rate": 0.0025482142857142856,
1526
- "loss": 7.4936,
1527
- "step": 253
1528
- },
1529
- {
1530
- "epoch": 7.26,
1531
- "learning_rate": 0.0025464285714285714,
1532
- "loss": 7.6966,
1533
- "step": 254
1534
- },
1535
- {
1536
- "epoch": 7.29,
1537
- "learning_rate": 0.0025446428571428573,
1538
- "loss": 7.4701,
1539
- "step": 255
1540
- },
1541
- {
1542
- "epoch": 7.31,
1543
- "learning_rate": 0.0025428571428571427,
1544
- "loss": 7.511,
1545
- "step": 256
1546
- },
1547
- {
1548
- "epoch": 7.34,
1549
- "learning_rate": 0.0025410714285714286,
1550
- "loss": 7.3709,
1551
- "step": 257
1552
- },
1553
- {
1554
- "epoch": 7.37,
1555
- "learning_rate": 0.0025392857142857144,
1556
- "loss": 7.4582,
1557
- "step": 258
1558
- },
1559
- {
1560
- "epoch": 7.4,
1561
- "learning_rate": 0.0025375,
1562
- "loss": 7.4263,
1563
- "step": 259
1564
- },
1565
- {
1566
- "epoch": 7.43,
1567
- "learning_rate": 0.0025357142857142857,
1568
- "loss": 7.3134,
1569
- "step": 260
1570
- },
1571
- {
1572
- "epoch": 7.46,
1573
- "learning_rate": 0.0025339285714285715,
1574
- "loss": 7.3849,
1575
- "step": 261
1576
- },
1577
- {
1578
- "epoch": 7.49,
1579
- "learning_rate": 0.0025321428571428574,
1580
- "loss": 7.292,
1581
- "step": 262
1582
- },
1583
- {
1584
- "epoch": 7.51,
1585
- "learning_rate": 0.002530357142857143,
1586
- "loss": 7.343,
1587
- "step": 263
1588
- },
1589
- {
1590
- "epoch": 7.54,
1591
- "learning_rate": 0.0025285714285714286,
1592
- "loss": 7.3166,
1593
- "step": 264
1594
- },
1595
- {
1596
- "epoch": 7.57,
1597
- "learning_rate": 0.0025267857142857145,
1598
- "loss": 7.2676,
1599
- "step": 265
1600
- },
1601
- {
1602
- "epoch": 7.6,
1603
- "learning_rate": 0.002525,
1604
- "loss": 7.2955,
1605
- "step": 266
1606
- },
1607
- {
1608
- "epoch": 7.63,
1609
- "learning_rate": 0.0025232142857142857,
1610
- "loss": 7.3386,
1611
- "step": 267
1612
- },
1613
- {
1614
- "epoch": 7.66,
1615
- "learning_rate": 0.0025214285714285716,
1616
- "loss": 7.2682,
1617
- "step": 268
1618
- },
1619
- {
1620
- "epoch": 7.69,
1621
- "learning_rate": 0.0025196428571428574,
1622
- "loss": 7.2359,
1623
- "step": 269
1624
- },
1625
- {
1626
- "epoch": 7.71,
1627
- "learning_rate": 0.002517857142857143,
1628
- "loss": 7.1849,
1629
- "step": 270
1630
- },
1631
- {
1632
- "epoch": 7.74,
1633
- "learning_rate": 0.0025160714285714287,
1634
- "loss": 7.2421,
1635
- "step": 271
1636
- },
1637
- {
1638
- "epoch": 7.77,
1639
- "learning_rate": 0.0025142857142857146,
1640
- "loss": 7.2341,
1641
- "step": 272
1642
- },
1643
- {
1644
- "epoch": 7.8,
1645
- "learning_rate": 0.0025125,
1646
- "loss": 7.2901,
1647
- "step": 273
1648
- },
1649
- {
1650
- "epoch": 7.83,
1651
- "learning_rate": 0.002510714285714286,
1652
- "loss": 7.1931,
1653
- "step": 274
1654
- },
1655
- {
1656
- "epoch": 7.86,
1657
- "learning_rate": 0.0025089285714285717,
1658
- "loss": 7.1907,
1659
- "step": 275
1660
- },
1661
- {
1662
- "epoch": 7.89,
1663
- "learning_rate": 0.002507142857142857,
1664
- "loss": 7.2369,
1665
- "step": 276
1666
- },
1667
- {
1668
- "epoch": 7.91,
1669
- "learning_rate": 0.002505357142857143,
1670
- "loss": 7.1764,
1671
- "step": 277
1672
- },
1673
- {
1674
- "epoch": 7.94,
1675
- "learning_rate": 0.002503571428571429,
1676
- "loss": 7.1928,
1677
- "step": 278
1678
- },
1679
- {
1680
- "epoch": 7.97,
1681
- "learning_rate": 0.0025017857142857146,
1682
- "loss": 7.2114,
1683
- "step": 279
1684
- },
1685
- {
1686
- "epoch": 8.0,
1687
- "learning_rate": 0.0025,
1688
- "loss": 7.2307,
1689
- "step": 280
1690
- },
1691
- {
1692
- "epoch": 8.03,
1693
- "learning_rate": 0.002498214285714286,
1694
- "loss": 7.2477,
1695
- "step": 281
1696
- },
1697
- {
1698
- "epoch": 8.06,
1699
- "learning_rate": 0.0024964285714285718,
1700
- "loss": 7.2069,
1701
- "step": 282
1702
- },
1703
- {
1704
- "epoch": 8.09,
1705
- "learning_rate": 0.002494642857142857,
1706
- "loss": 7.1484,
1707
- "step": 283
1708
- },
1709
- {
1710
- "epoch": 8.11,
1711
- "learning_rate": 0.002492857142857143,
1712
- "loss": 7.1076,
1713
- "step": 284
1714
- },
1715
- {
1716
- "epoch": 8.14,
1717
- "learning_rate": 0.002491071428571429,
1718
- "loss": 7.0819,
1719
- "step": 285
1720
- },
1721
- {
1722
- "epoch": 8.17,
1723
- "learning_rate": 0.0024892857142857143,
1724
- "loss": 7.0708,
1725
- "step": 286
1726
- },
1727
- {
1728
- "epoch": 8.2,
1729
- "learning_rate": 0.0024875,
1730
- "loss": 7.0763,
1731
- "step": 287
1732
- },
1733
- {
1734
- "epoch": 8.23,
1735
- "learning_rate": 0.002485714285714286,
1736
- "loss": 7.0792,
1737
- "step": 288
1738
- },
1739
- {
1740
- "epoch": 8.26,
1741
- "learning_rate": 0.0024839285714285714,
1742
- "loss": 7.1397,
1743
- "step": 289
1744
- },
1745
- {
1746
- "epoch": 8.29,
1747
- "learning_rate": 0.0024821428571428572,
1748
- "loss": 7.0893,
1749
- "step": 290
1750
- },
1751
- {
1752
- "epoch": 8.31,
1753
- "learning_rate": 0.0024803571428571427,
1754
- "loss": 7.1263,
1755
- "step": 291
1756
- },
1757
- {
1758
- "epoch": 8.34,
1759
- "learning_rate": 0.0024785714285714285,
1760
- "loss": 7.0226,
1761
- "step": 292
1762
- },
1763
- {
1764
- "epoch": 8.37,
1765
- "learning_rate": 0.0024767857142857144,
1766
- "loss": 7.1017,
1767
- "step": 293
1768
- },
1769
- {
1770
- "epoch": 8.4,
1771
- "learning_rate": 0.0024749999999999998,
1772
- "loss": 7.0161,
1773
- "step": 294
1774
- },
1775
- {
1776
- "epoch": 8.43,
1777
- "learning_rate": 0.0024732142857142856,
1778
- "loss": 7.117,
1779
- "step": 295
1780
- },
1781
- {
1782
- "epoch": 8.46,
1783
- "learning_rate": 0.0024714285714285715,
1784
- "loss": 7.0234,
1785
- "step": 296
1786
- },
1787
- {
1788
- "epoch": 8.49,
1789
- "learning_rate": 0.002469642857142857,
1790
- "loss": 7.0663,
1791
- "step": 297
1792
- },
1793
- {
1794
- "epoch": 8.51,
1795
- "learning_rate": 0.0024678571428571427,
1796
- "loss": 7.1604,
1797
- "step": 298
1798
- },
1799
- {
1800
- "epoch": 8.54,
1801
- "learning_rate": 0.0024660714285714286,
1802
- "loss": 7.0543,
1803
- "step": 299
1804
- },
1805
- {
1806
- "epoch": 8.57,
1807
- "learning_rate": 0.0024642857142857144,
1808
- "loss": 7.0131,
1809
- "step": 300
1810
- },
1811
- {
1812
- "epoch": 8.6,
1813
- "learning_rate": 0.0024625,
1814
- "loss": 7.0294,
1815
- "step": 301
1816
- },
1817
- {
1818
- "epoch": 8.63,
1819
- "learning_rate": 0.0024607142857142857,
1820
- "loss": 7.0273,
1821
- "step": 302
1822
- },
1823
- {
1824
- "epoch": 8.66,
1825
- "learning_rate": 0.0024589285714285715,
1826
- "loss": 7.0074,
1827
- "step": 303
1828
- },
1829
- {
1830
- "epoch": 8.69,
1831
- "learning_rate": 0.002457142857142857,
1832
- "loss": 6.9747,
1833
- "step": 304
1834
- },
1835
- {
1836
- "epoch": 8.71,
1837
- "learning_rate": 0.002455357142857143,
1838
- "loss": 7.0617,
1839
- "step": 305
1840
- },
1841
- {
1842
- "epoch": 8.74,
1843
- "learning_rate": 0.0024535714285714287,
1844
- "loss": 7.0907,
1845
- "step": 306
1846
- },
1847
- {
1848
- "epoch": 8.77,
1849
- "learning_rate": 0.002451785714285714,
1850
- "loss": 7.0037,
1851
- "step": 307
1852
- },
1853
- {
1854
- "epoch": 8.8,
1855
- "learning_rate": 0.00245,
1856
- "loss": 6.969,
1857
- "step": 308
1858
- },
1859
- {
1860
- "epoch": 8.83,
1861
- "learning_rate": 0.0024482142857142858,
1862
- "loss": 7.0575,
1863
- "step": 309
1864
- },
1865
- {
1866
- "epoch": 8.86,
1867
- "learning_rate": 0.0024464285714285716,
1868
- "loss": 6.9494,
1869
- "step": 310
1870
- },
1871
- {
1872
- "epoch": 8.89,
1873
- "learning_rate": 0.002444642857142857,
1874
- "loss": 6.969,
1875
- "step": 311
1876
- },
1877
- {
1878
- "epoch": 8.91,
1879
- "learning_rate": 0.002442857142857143,
1880
- "loss": 6.8827,
1881
- "step": 312
1882
- },
1883
- {
1884
- "epoch": 8.94,
1885
- "learning_rate": 0.0024410714285714287,
1886
- "loss": 6.9058,
1887
- "step": 313
1888
- },
1889
- {
1890
- "epoch": 8.97,
1891
- "learning_rate": 0.002439285714285714,
1892
- "loss": 6.8808,
1893
- "step": 314
1894
- },
1895
- {
1896
- "epoch": 9.0,
1897
- "learning_rate": 0.0024375,
1898
- "loss": 6.9516,
1899
- "step": 315
1900
- },
1901
- {
1902
- "epoch": 9.03,
1903
- "learning_rate": 0.002435714285714286,
1904
- "loss": 6.9132,
1905
- "step": 316
1906
- },
1907
- {
1908
- "epoch": 9.06,
1909
- "learning_rate": 0.0024339285714285717,
1910
- "loss": 6.9058,
1911
- "step": 317
1912
- },
1913
- {
1914
- "epoch": 9.09,
1915
- "learning_rate": 0.002432142857142857,
1916
- "loss": 6.9332,
1917
- "step": 318
1918
- },
1919
- {
1920
- "epoch": 9.11,
1921
- "learning_rate": 0.002430357142857143,
1922
- "loss": 6.9757,
1923
- "step": 319
1924
- },
1925
- {
1926
- "epoch": 9.14,
1927
- "learning_rate": 0.002428571428571429,
1928
- "loss": 6.8261,
1929
- "step": 320
1930
- },
1931
- {
1932
- "epoch": 9.17,
1933
- "learning_rate": 0.0024267857142857142,
1934
- "loss": 6.8571,
1935
- "step": 321
1936
- },
1937
- {
1938
- "epoch": 9.2,
1939
- "learning_rate": 0.002425,
1940
- "loss": 6.8435,
1941
- "step": 322
1942
- },
1943
- {
1944
- "epoch": 9.23,
1945
- "learning_rate": 0.002423214285714286,
1946
- "loss": 6.9033,
1947
- "step": 323
1948
- },
1949
- {
1950
- "epoch": 9.26,
1951
- "learning_rate": 0.0024214285714285713,
1952
- "loss": 6.8042,
1953
- "step": 324
1954
- },
1955
- {
1956
- "epoch": 9.29,
1957
- "learning_rate": 0.002419642857142857,
1958
- "loss": 6.8732,
1959
- "step": 325
1960
- },
1961
- {
1962
- "epoch": 9.31,
1963
- "learning_rate": 0.002417857142857143,
1964
- "loss": 6.752,
1965
- "step": 326
1966
- },
1967
- {
1968
- "epoch": 9.34,
1969
- "learning_rate": 0.002416071428571429,
1970
- "loss": 6.8016,
1971
- "step": 327
1972
- },
1973
- {
1974
- "epoch": 9.37,
1975
- "learning_rate": 0.0024142857142857143,
1976
- "loss": 6.8879,
1977
- "step": 328
1978
- },
1979
- {
1980
- "epoch": 9.4,
1981
- "learning_rate": 0.0024125,
1982
- "loss": 6.7643,
1983
- "step": 329
1984
- },
1985
- {
1986
- "epoch": 9.43,
1987
- "learning_rate": 0.002410714285714286,
1988
- "loss": 6.7084,
1989
- "step": 330
1990
- },
1991
- {
1992
- "epoch": 9.46,
1993
- "learning_rate": 0.0024089285714285714,
1994
- "loss": 6.8049,
1995
- "step": 331
1996
- },
1997
- {
1998
- "epoch": 9.49,
1999
- "learning_rate": 0.0024071428571428573,
2000
- "loss": 6.7925,
2001
- "step": 332
2002
- },
2003
- {
2004
- "epoch": 9.51,
2005
- "learning_rate": 0.002405357142857143,
2006
- "loss": 6.7289,
2007
- "step": 333
2008
- },
2009
- {
2010
- "epoch": 9.54,
2011
- "learning_rate": 0.0024035714285714285,
2012
- "loss": 6.7439,
2013
- "step": 334
2014
- },
2015
- {
2016
- "epoch": 9.57,
2017
- "learning_rate": 0.0024017857142857144,
2018
- "loss": 6.7119,
2019
- "step": 335
2020
- },
2021
- {
2022
- "epoch": 9.6,
2023
- "learning_rate": 0.0024000000000000002,
2024
- "loss": 6.7251,
2025
- "step": 336
2026
- },
2027
- {
2028
- "epoch": 9.63,
2029
- "learning_rate": 0.002398214285714286,
2030
- "loss": 6.6659,
2031
- "step": 337
2032
- },
2033
- {
2034
- "epoch": 9.66,
2035
- "learning_rate": 0.0023964285714285715,
2036
- "loss": 6.7422,
2037
- "step": 338
2038
- },
2039
- {
2040
- "epoch": 9.69,
2041
- "learning_rate": 0.0023946428571428573,
2042
- "loss": 6.7852,
2043
- "step": 339
2044
- },
2045
- {
2046
- "epoch": 9.71,
2047
- "learning_rate": 0.002392857142857143,
2048
- "loss": 6.6828,
2049
- "step": 340
2050
- },
2051
- {
2052
- "epoch": 9.74,
2053
- "learning_rate": 0.0023910714285714286,
2054
- "loss": 6.686,
2055
- "step": 341
2056
- },
2057
- {
2058
- "epoch": 9.77,
2059
- "learning_rate": 0.002389285714285714,
2060
- "loss": 6.7326,
2061
- "step": 342
2062
- },
2063
- {
2064
- "epoch": 9.8,
2065
- "learning_rate": 0.0023875,
2066
- "loss": 6.5601,
2067
- "step": 343
2068
- },
2069
- {
2070
- "epoch": 9.83,
2071
- "learning_rate": 0.0023857142857142857,
2072
- "loss": 6.6646,
2073
- "step": 344
2074
- },
2075
- {
2076
- "epoch": 9.86,
2077
- "learning_rate": 0.002383928571428571,
2078
- "loss": 6.5673,
2079
- "step": 345
2080
- },
2081
- {
2082
- "epoch": 9.89,
2083
- "learning_rate": 0.002382142857142857,
2084
- "loss": 6.6227,
2085
- "step": 346
2086
- },
2087
- {
2088
- "epoch": 9.91,
2089
- "learning_rate": 0.002380357142857143,
2090
- "loss": 6.5526,
2091
- "step": 347
2092
- },
2093
- {
2094
- "epoch": 9.94,
2095
- "learning_rate": 0.0023785714285714287,
2096
- "loss": 6.6842,
2097
- "step": 348
2098
- },
2099
- {
2100
- "epoch": 9.97,
2101
- "learning_rate": 0.002376785714285714,
2102
- "loss": 6.6211,
2103
- "step": 349
2104
- },
2105
- {
2106
- "epoch": 10.0,
2107
- "learning_rate": 0.002375,
2108
- "loss": 6.6952,
2109
- "step": 350
2110
- },
2111
- {
2112
- "epoch": 10.03,
2113
- "learning_rate": 0.002373214285714286,
2114
- "loss": 6.5324,
2115
- "step": 351
2116
- },
2117
- {
2118
- "epoch": 10.06,
2119
- "learning_rate": 0.002371428571428571,
2120
- "loss": 6.5792,
2121
- "step": 352
2122
- },
2123
- {
2124
- "epoch": 10.09,
2125
- "learning_rate": 0.002369642857142857,
2126
- "loss": 6.5276,
2127
- "step": 353
2128
- },
2129
- {
2130
- "epoch": 10.11,
2131
- "learning_rate": 0.002367857142857143,
2132
- "loss": 6.5634,
2133
- "step": 354
2134
- },
2135
- {
2136
- "epoch": 10.14,
2137
- "learning_rate": 0.0023660714285714288,
2138
- "loss": 6.5385,
2139
- "step": 355
2140
- },
2141
- {
2142
- "epoch": 10.17,
2143
- "learning_rate": 0.002364285714285714,
2144
- "loss": 6.4516,
2145
- "step": 356
2146
- },
2147
- {
2148
- "epoch": 10.2,
2149
- "learning_rate": 0.0023625,
2150
- "loss": 6.5641,
2151
- "step": 357
2152
- },
2153
- {
2154
- "epoch": 10.23,
2155
- "learning_rate": 0.002360714285714286,
2156
- "loss": 6.5001,
2157
- "step": 358
2158
- },
2159
- {
2160
- "epoch": 10.26,
2161
- "learning_rate": 0.0023589285714285713,
2162
- "loss": 6.4846,
2163
- "step": 359
2164
- },
2165
- {
2166
- "epoch": 10.29,
2167
- "learning_rate": 0.002357142857142857,
2168
- "loss": 6.4638,
2169
- "step": 360
2170
- },
2171
- {
2172
- "epoch": 10.31,
2173
- "learning_rate": 0.002355357142857143,
2174
- "loss": 6.5217,
2175
- "step": 361
2176
- },
2177
- {
2178
- "epoch": 10.34,
2179
- "learning_rate": 0.0023535714285714284,
2180
- "loss": 6.5444,
2181
- "step": 362
2182
- },
2183
- {
2184
- "epoch": 10.37,
2185
- "learning_rate": 0.0023517857142857142,
2186
- "loss": 6.496,
2187
- "step": 363
2188
- },
2189
- {
2190
- "epoch": 10.4,
2191
- "learning_rate": 0.00235,
2192
- "loss": 6.5345,
2193
- "step": 364
2194
- },
2195
- {
2196
- "epoch": 10.43,
2197
- "learning_rate": 0.002348214285714286,
2198
- "loss": 6.4732,
2199
- "step": 365
2200
- },
2201
- {
2202
- "epoch": 10.46,
2203
- "learning_rate": 0.0023464285714285714,
2204
- "loss": 6.4765,
2205
- "step": 366
2206
- },
2207
- {
2208
- "epoch": 10.49,
2209
- "learning_rate": 0.002344642857142857,
2210
- "loss": 6.3881,
2211
- "step": 367
2212
- },
2213
- {
2214
- "epoch": 10.51,
2215
- "learning_rate": 0.002342857142857143,
2216
- "loss": 6.4908,
2217
- "step": 368
2218
- },
2219
- {
2220
- "epoch": 10.54,
2221
- "learning_rate": 0.0023410714285714285,
2222
- "loss": 6.4593,
2223
- "step": 369
2224
- },
2225
- {
2226
- "epoch": 10.57,
2227
- "learning_rate": 0.0023392857142857143,
2228
- "loss": 6.5006,
2229
- "step": 370
2230
- },
2231
- {
2232
- "epoch": 10.6,
2233
- "learning_rate": 0.0023375,
2234
- "loss": 6.4495,
2235
- "step": 371
2236
- },
2237
- {
2238
- "epoch": 10.63,
2239
- "learning_rate": 0.0023357142857142856,
2240
- "loss": 6.3569,
2241
- "step": 372
2242
- },
2243
- {
2244
- "epoch": 10.66,
2245
- "learning_rate": 0.0023339285714285714,
2246
- "loss": 6.3592,
2247
- "step": 373
2248
- },
2249
- {
2250
- "epoch": 10.69,
2251
- "learning_rate": 0.0023321428571428573,
2252
- "loss": 6.3258,
2253
- "step": 374
2254
- },
2255
- {
2256
- "epoch": 10.71,
2257
- "learning_rate": 0.002330357142857143,
2258
- "loss": 6.3216,
2259
- "step": 375
2260
- },
2261
- {
2262
- "epoch": 10.74,
2263
- "learning_rate": 0.0023285714285714285,
2264
- "loss": 6.4878,
2265
- "step": 376
2266
- },
2267
- {
2268
- "epoch": 10.77,
2269
- "learning_rate": 0.0023267857142857144,
2270
- "loss": 6.3412,
2271
- "step": 377
2272
- },
2273
- {
2274
- "epoch": 10.8,
2275
- "learning_rate": 0.0023250000000000002,
2276
- "loss": 6.3925,
2277
- "step": 378
2278
- },
2279
- {
2280
- "epoch": 10.83,
2281
- "learning_rate": 0.0023232142857142857,
2282
- "loss": 6.275,
2283
- "step": 379
2284
- },
2285
- {
2286
- "epoch": 10.86,
2287
- "learning_rate": 0.0023214285714285715,
2288
- "loss": 6.3575,
2289
- "step": 380
2290
- },
2291
- {
2292
- "epoch": 10.89,
2293
- "learning_rate": 0.0023196428571428574,
2294
- "loss": 6.3259,
2295
- "step": 381
2296
- },
2297
- {
2298
- "epoch": 10.91,
2299
- "learning_rate": 0.002317857142857143,
2300
- "loss": 6.315,
2301
- "step": 382
2302
- },
2303
- {
2304
- "epoch": 10.94,
2305
- "learning_rate": 0.0023160714285714286,
2306
- "loss": 6.277,
2307
- "step": 383
2308
- },
2309
- {
2310
- "epoch": 10.97,
2311
- "learning_rate": 0.0023142857142857145,
2312
- "loss": 6.3259,
2313
- "step": 384
2314
- },
2315
- {
2316
- "epoch": 11.0,
2317
- "learning_rate": 0.0023125000000000003,
2318
- "loss": 6.3747,
2319
- "step": 385
2320
- },
2321
- {
2322
- "epoch": 11.03,
2323
- "learning_rate": 0.0023107142857142857,
2324
- "loss": 6.3646,
2325
- "step": 386
2326
- },
2327
- {
2328
- "epoch": 11.06,
2329
- "learning_rate": 0.0023089285714285716,
2330
- "loss": 6.3687,
2331
- "step": 387
2332
- },
2333
- {
2334
- "epoch": 11.09,
2335
- "learning_rate": 0.0023071428571428574,
2336
- "loss": 6.3374,
2337
- "step": 388
2338
- },
2339
- {
2340
- "epoch": 11.11,
2341
- "learning_rate": 0.002305357142857143,
2342
- "loss": 6.3129,
2343
- "step": 389
2344
- },
2345
- {
2346
- "epoch": 11.14,
2347
- "learning_rate": 0.0023035714285714287,
2348
- "loss": 6.3425,
2349
- "step": 390
2350
- },
2351
- {
2352
- "epoch": 11.17,
2353
- "learning_rate": 0.0023017857142857145,
2354
- "loss": 6.2122,
2355
- "step": 391
2356
- },
2357
- {
2358
- "epoch": 11.2,
2359
- "learning_rate": 0.0023000000000000004,
2360
- "loss": 6.2768,
2361
- "step": 392
2362
- },
2363
- {
2364
- "epoch": 11.23,
2365
- "learning_rate": 0.002298214285714286,
2366
- "loss": 6.2853,
2367
- "step": 393
2368
- },
2369
- {
2370
- "epoch": 11.26,
2371
- "learning_rate": 0.0022964285714285712,
2372
- "loss": 6.3215,
2373
- "step": 394
2374
- },
2375
- {
2376
- "epoch": 11.29,
2377
- "learning_rate": 0.002294642857142857,
2378
- "loss": 6.3244,
2379
- "step": 395
2380
- },
2381
- {
2382
- "epoch": 11.31,
2383
- "learning_rate": 0.002292857142857143,
2384
- "loss": 6.2399,
2385
- "step": 396
2386
- },
2387
- {
2388
- "epoch": 11.34,
2389
- "learning_rate": 0.0022910714285714283,
2390
- "loss": 6.2457,
2391
- "step": 397
2392
- },
2393
- {
2394
- "epoch": 11.37,
2395
- "learning_rate": 0.002289285714285714,
2396
- "loss": 6.2018,
2397
- "step": 398
2398
- },
2399
- {
2400
- "epoch": 11.4,
2401
- "learning_rate": 0.0022875,
2402
- "loss": 6.2101,
2403
- "step": 399
2404
- },
2405
- {
2406
- "epoch": 11.43,
2407
- "learning_rate": 0.0022857142857142855,
2408
- "loss": 6.2257,
2409
- "step": 400
2410
  }
2411
  ],
2412
  "logging_steps": 1,
2413
  "max_steps": 1680,
2414
  "num_train_epochs": 48,
2415
  "save_steps": 100,
2416
- "total_flos": 2.3080655169481728e+17,
2417
  "trial_name": null,
2418
  "trial_params": null
2419
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.714285714285714,
5
  "eval_steps": 500,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1207
  "learning_rate": 0.002642857142857143,
1208
  "loss": 1.678,
1209
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1210
  }
1211
  ],
1212
  "logging_steps": 1,
1213
  "max_steps": 1680,
1214
  "num_train_epochs": 48,
1215
  "save_steps": 100,
1216
+ "total_flos": 1.1559978611371008e+17,
1217
  "trial_name": null,
1218
  "trial_params": null
1219
  }