longcld commited on
Commit
4aed245
1 Parent(s): cc70666
Files changed (5) hide show
  1. optimizer.pt +1 -1
  2. pytorch_model.bin +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +693 -3
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7717cc77b9dfdc741573ac36af9500ebe261a63a279912b8f960b7c9e5242c99
3
  size 1585984129
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:641c85a58e4cda476ffb7fb5c7db5fc4127f3ae4e0e86f2f8596e568eec42bb4
3
  size 1585984129
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1dbf7512968e9b49a9effc5e8b456b7594341266a4da30528a298080dd30eccb
3
  size 1561347007
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:795bd227bc8849bbff4aa95b4e34cb546e42243310b55d6d27498891fd16ede9
3
  size 1561347007
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:076be2f9f01be38c4835c9081e22297aef567c05e412b1d30dc0e816e787e53a
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d554d225b4de5166bd769415b56e27db6938660c20b187cbf1afe1eaf730132
3
  size 14503
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c560416a495505caae4f068fd5c58b4aab552a43341f03d9f27bd27c64a28cc8
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58dc44076a97aa8560be589b10abfd196fdd4df11329f0a9f0516806ffc7c4e9
3
  size 623
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.16296886763208,
5
- "global_step": 23500,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -1416,11 +1416,701 @@
1416
  "learning_rate": 2.9185119574845e-05,
1417
  "loss": 2.191,
1418
  "step": 23500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1419
  }
1420
  ],
1421
  "max_steps": 56450,
1422
  "num_train_epochs": 10,
1423
- "total_flos": 8.72923785124373e+17,
1424
  "trial_name": null,
1425
  "trial_params": null
1426
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 6.2001682830698375,
5
+ "global_step": 35000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
1416
  "learning_rate": 2.9185119574845e-05,
1417
  "loss": 2.191,
1418
  "step": 23500
1419
+ },
1420
+ {
1421
+ "epoch": 4.18,
1422
+ "learning_rate": 2.9096545615589016e-05,
1423
+ "loss": 2.1673,
1424
+ "step": 23600
1425
+ },
1426
+ {
1427
+ "epoch": 4.2,
1428
+ "learning_rate": 2.900797165633304e-05,
1429
+ "loss": 2.1964,
1430
+ "step": 23700
1431
+ },
1432
+ {
1433
+ "epoch": 4.22,
1434
+ "learning_rate": 2.891939769707706e-05,
1435
+ "loss": 2.21,
1436
+ "step": 23800
1437
+ },
1438
+ {
1439
+ "epoch": 4.23,
1440
+ "learning_rate": 2.8830823737821084e-05,
1441
+ "loss": 2.2209,
1442
+ "step": 23900
1443
+ },
1444
+ {
1445
+ "epoch": 4.25,
1446
+ "learning_rate": 2.8742249778565107e-05,
1447
+ "loss": 2.1892,
1448
+ "step": 24000
1449
+ },
1450
+ {
1451
+ "epoch": 4.27,
1452
+ "learning_rate": 2.8653675819309123e-05,
1453
+ "loss": 2.2222,
1454
+ "step": 24100
1455
+ },
1456
+ {
1457
+ "epoch": 4.29,
1458
+ "learning_rate": 2.8565101860053146e-05,
1459
+ "loss": 2.2133,
1460
+ "step": 24200
1461
+ },
1462
+ {
1463
+ "epoch": 4.3,
1464
+ "learning_rate": 2.847652790079717e-05,
1465
+ "loss": 2.2059,
1466
+ "step": 24300
1467
+ },
1468
+ {
1469
+ "epoch": 4.32,
1470
+ "learning_rate": 2.838795394154119e-05,
1471
+ "loss": 2.2117,
1472
+ "step": 24400
1473
+ },
1474
+ {
1475
+ "epoch": 4.34,
1476
+ "learning_rate": 2.8299379982285208e-05,
1477
+ "loss": 2.1632,
1478
+ "step": 24500
1479
+ },
1480
+ {
1481
+ "epoch": 4.36,
1482
+ "learning_rate": 2.821080602302923e-05,
1483
+ "loss": 2.2157,
1484
+ "step": 24600
1485
+ },
1486
+ {
1487
+ "epoch": 4.38,
1488
+ "learning_rate": 2.812223206377325e-05,
1489
+ "loss": 2.2001,
1490
+ "step": 24700
1491
+ },
1492
+ {
1493
+ "epoch": 4.39,
1494
+ "learning_rate": 2.8033658104517273e-05,
1495
+ "loss": 2.186,
1496
+ "step": 24800
1497
+ },
1498
+ {
1499
+ "epoch": 4.41,
1500
+ "learning_rate": 2.7945084145261296e-05,
1501
+ "loss": 2.1866,
1502
+ "step": 24900
1503
+ },
1504
+ {
1505
+ "epoch": 4.43,
1506
+ "learning_rate": 2.7856510186005312e-05,
1507
+ "loss": 2.1672,
1508
+ "step": 25000
1509
+ },
1510
+ {
1511
+ "epoch": 4.45,
1512
+ "learning_rate": 2.7767936226749335e-05,
1513
+ "loss": 2.1915,
1514
+ "step": 25100
1515
+ },
1516
+ {
1517
+ "epoch": 4.46,
1518
+ "learning_rate": 2.7679362267493358e-05,
1519
+ "loss": 2.1873,
1520
+ "step": 25200
1521
+ },
1522
+ {
1523
+ "epoch": 4.48,
1524
+ "learning_rate": 2.759078830823738e-05,
1525
+ "loss": 2.1836,
1526
+ "step": 25300
1527
+ },
1528
+ {
1529
+ "epoch": 4.5,
1530
+ "learning_rate": 2.7502214348981404e-05,
1531
+ "loss": 2.1446,
1532
+ "step": 25400
1533
+ },
1534
+ {
1535
+ "epoch": 4.52,
1536
+ "learning_rate": 2.741364038972542e-05,
1537
+ "loss": 2.1815,
1538
+ "step": 25500
1539
+ },
1540
+ {
1541
+ "epoch": 4.53,
1542
+ "learning_rate": 2.7325066430469442e-05,
1543
+ "loss": 2.1676,
1544
+ "step": 25600
1545
+ },
1546
+ {
1547
+ "epoch": 4.55,
1548
+ "learning_rate": 2.7236492471213465e-05,
1549
+ "loss": 2.1986,
1550
+ "step": 25700
1551
+ },
1552
+ {
1553
+ "epoch": 4.57,
1554
+ "learning_rate": 2.7147918511957488e-05,
1555
+ "loss": 2.1716,
1556
+ "step": 25800
1557
+ },
1558
+ {
1559
+ "epoch": 4.59,
1560
+ "learning_rate": 2.7059344552701504e-05,
1561
+ "loss": 2.1595,
1562
+ "step": 25900
1563
+ },
1564
+ {
1565
+ "epoch": 4.61,
1566
+ "learning_rate": 2.6970770593445527e-05,
1567
+ "loss": 2.1486,
1568
+ "step": 26000
1569
+ },
1570
+ {
1571
+ "epoch": 4.62,
1572
+ "learning_rate": 2.688219663418955e-05,
1573
+ "loss": 2.2041,
1574
+ "step": 26100
1575
+ },
1576
+ {
1577
+ "epoch": 4.64,
1578
+ "learning_rate": 2.6793622674933573e-05,
1579
+ "loss": 2.184,
1580
+ "step": 26200
1581
+ },
1582
+ {
1583
+ "epoch": 4.66,
1584
+ "learning_rate": 2.6705048715677596e-05,
1585
+ "loss": 2.1567,
1586
+ "step": 26300
1587
+ },
1588
+ {
1589
+ "epoch": 4.68,
1590
+ "learning_rate": 2.6616474756421612e-05,
1591
+ "loss": 2.1839,
1592
+ "step": 26400
1593
+ },
1594
+ {
1595
+ "epoch": 4.69,
1596
+ "learning_rate": 2.6527900797165635e-05,
1597
+ "loss": 2.1784,
1598
+ "step": 26500
1599
+ },
1600
+ {
1601
+ "epoch": 4.71,
1602
+ "learning_rate": 2.6439326837909658e-05,
1603
+ "loss": 2.1683,
1604
+ "step": 26600
1605
+ },
1606
+ {
1607
+ "epoch": 4.73,
1608
+ "learning_rate": 2.6350752878653677e-05,
1609
+ "loss": 2.197,
1610
+ "step": 26700
1611
+ },
1612
+ {
1613
+ "epoch": 4.75,
1614
+ "learning_rate": 2.62621789193977e-05,
1615
+ "loss": 2.2058,
1616
+ "step": 26800
1617
+ },
1618
+ {
1619
+ "epoch": 4.77,
1620
+ "learning_rate": 2.6173604960141716e-05,
1621
+ "loss": 2.1386,
1622
+ "step": 26900
1623
+ },
1624
+ {
1625
+ "epoch": 4.78,
1626
+ "learning_rate": 2.608503100088574e-05,
1627
+ "loss": 2.1782,
1628
+ "step": 27000
1629
+ },
1630
+ {
1631
+ "epoch": 4.8,
1632
+ "learning_rate": 2.5996457041629762e-05,
1633
+ "loss": 2.2116,
1634
+ "step": 27100
1635
+ },
1636
+ {
1637
+ "epoch": 4.82,
1638
+ "learning_rate": 2.5907883082373785e-05,
1639
+ "loss": 2.164,
1640
+ "step": 27200
1641
+ },
1642
+ {
1643
+ "epoch": 4.84,
1644
+ "learning_rate": 2.58193091231178e-05,
1645
+ "loss": 2.1668,
1646
+ "step": 27300
1647
+ },
1648
+ {
1649
+ "epoch": 4.85,
1650
+ "learning_rate": 2.5730735163861824e-05,
1651
+ "loss": 2.1444,
1652
+ "step": 27400
1653
+ },
1654
+ {
1655
+ "epoch": 4.87,
1656
+ "learning_rate": 2.5642161204605846e-05,
1657
+ "loss": 2.1801,
1658
+ "step": 27500
1659
+ },
1660
+ {
1661
+ "epoch": 4.89,
1662
+ "learning_rate": 2.555358724534987e-05,
1663
+ "loss": 2.1782,
1664
+ "step": 27600
1665
+ },
1666
+ {
1667
+ "epoch": 4.91,
1668
+ "learning_rate": 2.5465013286093892e-05,
1669
+ "loss": 2.1644,
1670
+ "step": 27700
1671
+ },
1672
+ {
1673
+ "epoch": 4.92,
1674
+ "learning_rate": 2.5376439326837908e-05,
1675
+ "loss": 2.1015,
1676
+ "step": 27800
1677
+ },
1678
+ {
1679
+ "epoch": 4.94,
1680
+ "learning_rate": 2.528786536758193e-05,
1681
+ "loss": 2.1643,
1682
+ "step": 27900
1683
+ },
1684
+ {
1685
+ "epoch": 4.96,
1686
+ "learning_rate": 2.5199291408325954e-05,
1687
+ "loss": 2.1822,
1688
+ "step": 28000
1689
+ },
1690
+ {
1691
+ "epoch": 4.98,
1692
+ "learning_rate": 2.5110717449069977e-05,
1693
+ "loss": 2.195,
1694
+ "step": 28100
1695
+ },
1696
+ {
1697
+ "epoch": 5.0,
1698
+ "learning_rate": 2.5022143489814e-05,
1699
+ "loss": 2.1692,
1700
+ "step": 28200
1701
+ },
1702
+ {
1703
+ "epoch": 5.01,
1704
+ "learning_rate": 2.493356953055802e-05,
1705
+ "loss": 2.1791,
1706
+ "step": 28300
1707
+ },
1708
+ {
1709
+ "epoch": 5.03,
1710
+ "learning_rate": 2.484499557130204e-05,
1711
+ "loss": 2.1332,
1712
+ "step": 28400
1713
+ },
1714
+ {
1715
+ "epoch": 5.05,
1716
+ "learning_rate": 2.475642161204606e-05,
1717
+ "loss": 2.1833,
1718
+ "step": 28500
1719
+ },
1720
+ {
1721
+ "epoch": 5.07,
1722
+ "learning_rate": 2.466784765279008e-05,
1723
+ "loss": 2.1712,
1724
+ "step": 28600
1725
+ },
1726
+ {
1727
+ "epoch": 5.08,
1728
+ "learning_rate": 2.4579273693534104e-05,
1729
+ "loss": 2.1017,
1730
+ "step": 28700
1731
+ },
1732
+ {
1733
+ "epoch": 5.1,
1734
+ "learning_rate": 2.4490699734278123e-05,
1735
+ "loss": 2.1789,
1736
+ "step": 28800
1737
+ },
1738
+ {
1739
+ "epoch": 5.12,
1740
+ "learning_rate": 2.4402125775022143e-05,
1741
+ "loss": 2.1292,
1742
+ "step": 28900
1743
+ },
1744
+ {
1745
+ "epoch": 5.14,
1746
+ "learning_rate": 2.4313551815766166e-05,
1747
+ "loss": 2.1539,
1748
+ "step": 29000
1749
+ },
1750
+ {
1751
+ "epoch": 5.15,
1752
+ "learning_rate": 2.4224977856510185e-05,
1753
+ "loss": 2.163,
1754
+ "step": 29100
1755
+ },
1756
+ {
1757
+ "epoch": 5.17,
1758
+ "learning_rate": 2.4136403897254208e-05,
1759
+ "loss": 2.1536,
1760
+ "step": 29200
1761
+ },
1762
+ {
1763
+ "epoch": 5.19,
1764
+ "learning_rate": 2.4047829937998228e-05,
1765
+ "loss": 2.1545,
1766
+ "step": 29300
1767
+ },
1768
+ {
1769
+ "epoch": 5.21,
1770
+ "learning_rate": 2.395925597874225e-05,
1771
+ "loss": 2.1721,
1772
+ "step": 29400
1773
+ },
1774
+ {
1775
+ "epoch": 5.23,
1776
+ "learning_rate": 2.3870682019486273e-05,
1777
+ "loss": 2.1453,
1778
+ "step": 29500
1779
+ },
1780
+ {
1781
+ "epoch": 5.24,
1782
+ "learning_rate": 2.3782108060230293e-05,
1783
+ "loss": 2.1464,
1784
+ "step": 29600
1785
+ },
1786
+ {
1787
+ "epoch": 5.26,
1788
+ "learning_rate": 2.3693534100974316e-05,
1789
+ "loss": 2.1423,
1790
+ "step": 29700
1791
+ },
1792
+ {
1793
+ "epoch": 5.28,
1794
+ "learning_rate": 2.3604960141718335e-05,
1795
+ "loss": 2.1569,
1796
+ "step": 29800
1797
+ },
1798
+ {
1799
+ "epoch": 5.3,
1800
+ "learning_rate": 2.3516386182462358e-05,
1801
+ "loss": 2.1246,
1802
+ "step": 29900
1803
+ },
1804
+ {
1805
+ "epoch": 5.31,
1806
+ "learning_rate": 2.3427812223206377e-05,
1807
+ "loss": 2.1579,
1808
+ "step": 30000
1809
+ },
1810
+ {
1811
+ "epoch": 5.33,
1812
+ "learning_rate": 2.33392382639504e-05,
1813
+ "loss": 2.1465,
1814
+ "step": 30100
1815
+ },
1816
+ {
1817
+ "epoch": 5.35,
1818
+ "learning_rate": 2.3250664304694423e-05,
1819
+ "loss": 2.1828,
1820
+ "step": 30200
1821
+ },
1822
+ {
1823
+ "epoch": 5.37,
1824
+ "learning_rate": 2.3162090345438443e-05,
1825
+ "loss": 2.1064,
1826
+ "step": 30300
1827
+ },
1828
+ {
1829
+ "epoch": 5.39,
1830
+ "learning_rate": 2.3073516386182465e-05,
1831
+ "loss": 2.1668,
1832
+ "step": 30400
1833
+ },
1834
+ {
1835
+ "epoch": 5.4,
1836
+ "learning_rate": 2.2984942426926485e-05,
1837
+ "loss": 2.1365,
1838
+ "step": 30500
1839
+ },
1840
+ {
1841
+ "epoch": 5.42,
1842
+ "learning_rate": 2.2896368467670508e-05,
1843
+ "loss": 2.1518,
1844
+ "step": 30600
1845
+ },
1846
+ {
1847
+ "epoch": 5.44,
1848
+ "learning_rate": 2.2807794508414527e-05,
1849
+ "loss": 2.0826,
1850
+ "step": 30700
1851
+ },
1852
+ {
1853
+ "epoch": 5.46,
1854
+ "learning_rate": 2.271922054915855e-05,
1855
+ "loss": 2.1492,
1856
+ "step": 30800
1857
+ },
1858
+ {
1859
+ "epoch": 5.47,
1860
+ "learning_rate": 2.263064658990257e-05,
1861
+ "loss": 2.1666,
1862
+ "step": 30900
1863
+ },
1864
+ {
1865
+ "epoch": 5.49,
1866
+ "learning_rate": 2.254207263064659e-05,
1867
+ "loss": 2.137,
1868
+ "step": 31000
1869
+ },
1870
+ {
1871
+ "epoch": 5.51,
1872
+ "learning_rate": 2.2453498671390612e-05,
1873
+ "loss": 2.1304,
1874
+ "step": 31100
1875
+ },
1876
+ {
1877
+ "epoch": 5.53,
1878
+ "learning_rate": 2.236492471213463e-05,
1879
+ "loss": 2.0874,
1880
+ "step": 31200
1881
+ },
1882
+ {
1883
+ "epoch": 5.54,
1884
+ "learning_rate": 2.2276350752878654e-05,
1885
+ "loss": 2.1058,
1886
+ "step": 31300
1887
+ },
1888
+ {
1889
+ "epoch": 5.56,
1890
+ "learning_rate": 2.2187776793622674e-05,
1891
+ "loss": 2.1319,
1892
+ "step": 31400
1893
+ },
1894
+ {
1895
+ "epoch": 5.58,
1896
+ "learning_rate": 2.2099202834366697e-05,
1897
+ "loss": 2.168,
1898
+ "step": 31500
1899
+ },
1900
+ {
1901
+ "epoch": 5.6,
1902
+ "learning_rate": 2.201062887511072e-05,
1903
+ "loss": 2.1371,
1904
+ "step": 31600
1905
+ },
1906
+ {
1907
+ "epoch": 5.62,
1908
+ "learning_rate": 2.192205491585474e-05,
1909
+ "loss": 2.1326,
1910
+ "step": 31700
1911
+ },
1912
+ {
1913
+ "epoch": 5.63,
1914
+ "learning_rate": 2.1833480956598762e-05,
1915
+ "loss": 2.1217,
1916
+ "step": 31800
1917
+ },
1918
+ {
1919
+ "epoch": 5.65,
1920
+ "learning_rate": 2.174490699734278e-05,
1921
+ "loss": 2.1494,
1922
+ "step": 31900
1923
+ },
1924
+ {
1925
+ "epoch": 5.67,
1926
+ "learning_rate": 2.1656333038086804e-05,
1927
+ "loss": 2.1289,
1928
+ "step": 32000
1929
+ },
1930
+ {
1931
+ "epoch": 5.69,
1932
+ "learning_rate": 2.1567759078830824e-05,
1933
+ "loss": 2.0962,
1934
+ "step": 32100
1935
+ },
1936
+ {
1937
+ "epoch": 5.7,
1938
+ "learning_rate": 2.1479185119574847e-05,
1939
+ "loss": 2.1188,
1940
+ "step": 32200
1941
+ },
1942
+ {
1943
+ "epoch": 5.72,
1944
+ "learning_rate": 2.139061116031887e-05,
1945
+ "loss": 2.1163,
1946
+ "step": 32300
1947
+ },
1948
+ {
1949
+ "epoch": 5.74,
1950
+ "learning_rate": 2.130203720106289e-05,
1951
+ "loss": 2.1361,
1952
+ "step": 32400
1953
+ },
1954
+ {
1955
+ "epoch": 5.76,
1956
+ "learning_rate": 2.1213463241806912e-05,
1957
+ "loss": 2.0793,
1958
+ "step": 32500
1959
+ },
1960
+ {
1961
+ "epoch": 5.77,
1962
+ "learning_rate": 2.112488928255093e-05,
1963
+ "loss": 2.0944,
1964
+ "step": 32600
1965
+ },
1966
+ {
1967
+ "epoch": 5.79,
1968
+ "learning_rate": 2.1036315323294954e-05,
1969
+ "loss": 2.145,
1970
+ "step": 32700
1971
+ },
1972
+ {
1973
+ "epoch": 5.81,
1974
+ "learning_rate": 2.0947741364038974e-05,
1975
+ "loss": 2.1493,
1976
+ "step": 32800
1977
+ },
1978
+ {
1979
+ "epoch": 5.83,
1980
+ "learning_rate": 2.0859167404782996e-05,
1981
+ "loss": 2.1309,
1982
+ "step": 32900
1983
+ },
1984
+ {
1985
+ "epoch": 5.85,
1986
+ "learning_rate": 2.0770593445527016e-05,
1987
+ "loss": 2.1422,
1988
+ "step": 33000
1989
+ },
1990
+ {
1991
+ "epoch": 5.86,
1992
+ "learning_rate": 2.0682019486271035e-05,
1993
+ "loss": 2.1383,
1994
+ "step": 33100
1995
+ },
1996
+ {
1997
+ "epoch": 5.88,
1998
+ "learning_rate": 2.0593445527015058e-05,
1999
+ "loss": 2.1247,
2000
+ "step": 33200
2001
+ },
2002
+ {
2003
+ "epoch": 5.9,
2004
+ "learning_rate": 2.0504871567759078e-05,
2005
+ "loss": 2.0899,
2006
+ "step": 33300
2007
+ },
2008
+ {
2009
+ "epoch": 5.92,
2010
+ "learning_rate": 2.04162976085031e-05,
2011
+ "loss": 2.1189,
2012
+ "step": 33400
2013
+ },
2014
+ {
2015
+ "epoch": 5.93,
2016
+ "learning_rate": 2.032772364924712e-05,
2017
+ "loss": 2.1178,
2018
+ "step": 33500
2019
+ },
2020
+ {
2021
+ "epoch": 5.95,
2022
+ "learning_rate": 2.0239149689991143e-05,
2023
+ "loss": 2.1256,
2024
+ "step": 33600
2025
+ },
2026
+ {
2027
+ "epoch": 5.97,
2028
+ "learning_rate": 2.0150575730735166e-05,
2029
+ "loss": 2.1403,
2030
+ "step": 33700
2031
+ },
2032
+ {
2033
+ "epoch": 5.99,
2034
+ "learning_rate": 2.0062001771479185e-05,
2035
+ "loss": 2.1421,
2036
+ "step": 33800
2037
+ },
2038
+ {
2039
+ "epoch": 6.01,
2040
+ "learning_rate": 1.9973427812223208e-05,
2041
+ "loss": 2.1434,
2042
+ "step": 33900
2043
+ },
2044
+ {
2045
+ "epoch": 6.02,
2046
+ "learning_rate": 1.9884853852967228e-05,
2047
+ "loss": 2.1076,
2048
+ "step": 34000
2049
+ },
2050
+ {
2051
+ "epoch": 6.04,
2052
+ "learning_rate": 1.979627989371125e-05,
2053
+ "loss": 2.0935,
2054
+ "step": 34100
2055
+ },
2056
+ {
2057
+ "epoch": 6.06,
2058
+ "learning_rate": 1.970770593445527e-05,
2059
+ "loss": 2.1139,
2060
+ "step": 34200
2061
+ },
2062
+ {
2063
+ "epoch": 6.08,
2064
+ "learning_rate": 1.9619131975199293e-05,
2065
+ "loss": 2.1342,
2066
+ "step": 34300
2067
+ },
2068
+ {
2069
+ "epoch": 6.09,
2070
+ "learning_rate": 1.9530558015943316e-05,
2071
+ "loss": 2.1207,
2072
+ "step": 34400
2073
+ },
2074
+ {
2075
+ "epoch": 6.11,
2076
+ "learning_rate": 1.9441984056687335e-05,
2077
+ "loss": 2.0827,
2078
+ "step": 34500
2079
+ },
2080
+ {
2081
+ "epoch": 6.13,
2082
+ "learning_rate": 1.9353410097431358e-05,
2083
+ "loss": 2.0926,
2084
+ "step": 34600
2085
+ },
2086
+ {
2087
+ "epoch": 6.15,
2088
+ "learning_rate": 1.9264836138175378e-05,
2089
+ "loss": 2.1078,
2090
+ "step": 34700
2091
+ },
2092
+ {
2093
+ "epoch": 6.16,
2094
+ "learning_rate": 1.91762621789194e-05,
2095
+ "loss": 2.0997,
2096
+ "step": 34800
2097
+ },
2098
+ {
2099
+ "epoch": 6.18,
2100
+ "learning_rate": 1.908768821966342e-05,
2101
+ "loss": 2.1346,
2102
+ "step": 34900
2103
+ },
2104
+ {
2105
+ "epoch": 6.2,
2106
+ "learning_rate": 1.8999114260407443e-05,
2107
+ "loss": 2.108,
2108
+ "step": 35000
2109
  }
2110
  ],
2111
  "max_steps": 56450,
2112
  "num_train_epochs": 10,
2113
+ "total_flos": 1.2999215903313946e+18,
2114
  "trial_name": null,
2115
  "trial_params": null
2116
  }