Bingsu commited on
Commit
809d256
1 Parent(s): 8325836

Training in progress, step 60000

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:baae4e4a96784040ce868f96d42034bb242c904f78e226021a2e4a96a9cef3c0
3
  size 100170757
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ceedefb856fd84795e75aee417a2889e7dfef00f9cca82e610fbafac5203514
3
  size 100170757
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31c53fb7b6ad0de21b811e1658611e4a33f3b1de2bf1f9601b1aaa1ab1a4a342
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c51f436fc6ff7c66c8286fca81fd6d00dc485176f29ebe17de85db28a4fa91b5
3
  size 146774203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:278efd6da406e01e44c9f984c5e1ca1bb12b34417f4813c1c41649e79e52efe7
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff9803e49c54da5b93ea63a8f9cfb55e640978474df5d52e215ba5da04a71f90
3
  size 14503
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:040f5262cd15419c3551e6c0666b578c1bcd600700fb7ef85d83f816cd92b640
3
  size 246897640
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e429486456e317e2d30183574218e6d221698c823284eb9740704ef563e5d5d
3
  size 246897640
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.21486892995272883,
5
- "global_step": 50000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -1506,11 +1506,311 @@
1506
  "learning_rate": 0.0002800029021041788,
1507
  "loss": 3.367,
1508
  "step": 50000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1509
  }
1510
  ],
1511
  "max_steps": 500000,
1512
  "num_train_epochs": 3,
1513
- "total_flos": 7.96914991104e+16,
1514
  "trial_name": null,
1515
  "trial_params": null
1516
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.2578427159432746,
5
+ "global_step": 60000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
1506
  "learning_rate": 0.0002800029021041788,
1507
  "loss": 3.367,
1508
  "step": 50000
1509
+ },
1510
+ {
1511
+ "epoch": 0.22,
1512
+ "learning_rate": 0.00028174626854371866,
1513
+ "loss": 3.3536,
1514
+ "step": 50200
1515
+ },
1516
+ {
1517
+ "epoch": 0.22,
1518
+ "learning_rate": 0.0002834938154241324,
1519
+ "loss": 3.3545,
1520
+ "step": 50400
1521
+ },
1522
+ {
1523
+ "epoch": 0.22,
1524
+ "learning_rate": 0.00028524551208266224,
1525
+ "loss": 3.369,
1526
+ "step": 50600
1527
+ },
1528
+ {
1529
+ "epoch": 0.22,
1530
+ "learning_rate": 0.0002870013277837379,
1531
+ "loss": 3.3578,
1532
+ "step": 50800
1533
+ },
1534
+ {
1535
+ "epoch": 0.22,
1536
+ "learning_rate": 0.00028876123171951576,
1537
+ "loss": 3.3439,
1538
+ "step": 51000
1539
+ },
1540
+ {
1541
+ "epoch": 0.22,
1542
+ "learning_rate": 0.0002905251930104192,
1543
+ "loss": 3.3568,
1544
+ "step": 51200
1545
+ },
1546
+ {
1547
+ "epoch": 0.22,
1548
+ "learning_rate": 0.00029229318070568056,
1549
+ "loss": 3.3605,
1550
+ "step": 51400
1551
+ },
1552
+ {
1553
+ "epoch": 0.22,
1554
+ "learning_rate": 0.0002940651637838844,
1555
+ "loss": 3.358,
1556
+ "step": 51600
1557
+ },
1558
+ {
1559
+ "epoch": 0.22,
1560
+ "learning_rate": 0.0002958411111535111,
1561
+ "loss": 3.3603,
1562
+ "step": 51800
1563
+ },
1564
+ {
1565
+ "epoch": 0.22,
1566
+ "learning_rate": 0.00029762099165348357,
1567
+ "loss": 3.3527,
1568
+ "step": 52000
1569
+ },
1570
+ {
1571
+ "epoch": 0.22,
1572
+ "learning_rate": 0.0002994047740537128,
1573
+ "loss": 3.3409,
1574
+ "step": 52200
1575
+ },
1576
+ {
1577
+ "epoch": 0.23,
1578
+ "learning_rate": 0.00030119242705564664,
1579
+ "loss": 3.3495,
1580
+ "step": 52400
1581
+ },
1582
+ {
1583
+ "epoch": 0.23,
1584
+ "learning_rate": 0.00030298391929281857,
1585
+ "loss": 3.328,
1586
+ "step": 52600
1587
+ },
1588
+ {
1589
+ "epoch": 0.23,
1590
+ "learning_rate": 0.00030477921933139823,
1591
+ "loss": 3.3592,
1592
+ "step": 52800
1593
+ },
1594
+ {
1595
+ "epoch": 0.23,
1596
+ "learning_rate": 0.00030657829567074305,
1597
+ "loss": 3.36,
1598
+ "step": 53000
1599
+ },
1600
+ {
1601
+ "epoch": 0.23,
1602
+ "learning_rate": 0.0003083811167439507,
1603
+ "loss": 3.3558,
1604
+ "step": 53200
1605
+ },
1606
+ {
1607
+ "epoch": 0.23,
1608
+ "learning_rate": 0.0003101876509184131,
1609
+ "loss": 3.3206,
1610
+ "step": 53400
1611
+ },
1612
+ {
1613
+ "epoch": 0.23,
1614
+ "learning_rate": 0.00031199786649637145,
1615
+ "loss": 3.3376,
1616
+ "step": 53600
1617
+ },
1618
+ {
1619
+ "epoch": 0.23,
1620
+ "learning_rate": 0.0003138117317154723,
1621
+ "loss": 3.3242,
1622
+ "step": 53800
1623
+ },
1624
+ {
1625
+ "epoch": 0.23,
1626
+ "learning_rate": 0.0003156292147493255,
1627
+ "loss": 3.3368,
1628
+ "step": 54000
1629
+ },
1630
+ {
1631
+ "epoch": 0.23,
1632
+ "learning_rate": 0.00031745028370806165,
1633
+ "loss": 3.3366,
1634
+ "step": 54200
1635
+ },
1636
+ {
1637
+ "epoch": 0.23,
1638
+ "learning_rate": 0.00031927490663889203,
1639
+ "loss": 3.3267,
1640
+ "step": 54400
1641
+ },
1642
+ {
1643
+ "epoch": 0.23,
1644
+ "learning_rate": 0.00032110305152666953,
1645
+ "loss": 3.302,
1646
+ "step": 54600
1647
+ },
1648
+ {
1649
+ "epoch": 0.24,
1650
+ "learning_rate": 0.00032293468629445007,
1651
+ "loss": 3.3237,
1652
+ "step": 54800
1653
+ },
1654
+ {
1655
+ "epoch": 0.24,
1656
+ "learning_rate": 0.00032476977880405546,
1657
+ "loss": 3.3207,
1658
+ "step": 55000
1659
+ },
1660
+ {
1661
+ "epoch": 0.24,
1662
+ "learning_rate": 0.00032660829685663773,
1663
+ "loss": 3.3215,
1664
+ "step": 55200
1665
+ },
1666
+ {
1667
+ "epoch": 0.24,
1668
+ "learning_rate": 0.00032845020819324334,
1669
+ "loss": 3.3107,
1670
+ "step": 55400
1671
+ },
1672
+ {
1673
+ "epoch": 0.24,
1674
+ "learning_rate": 0.0003302954804953797,
1675
+ "loss": 3.3153,
1676
+ "step": 55600
1677
+ },
1678
+ {
1679
+ "epoch": 0.24,
1680
+ "learning_rate": 0.00033214408138558256,
1681
+ "loss": 3.341,
1682
+ "step": 55800
1683
+ },
1684
+ {
1685
+ "epoch": 0.24,
1686
+ "learning_rate": 0.0003339959784279831,
1687
+ "loss": 3.3183,
1688
+ "step": 56000
1689
+ },
1690
+ {
1691
+ "epoch": 0.24,
1692
+ "learning_rate": 0.00033585113912887776,
1693
+ "loss": 3.3295,
1694
+ "step": 56200
1695
+ },
1696
+ {
1697
+ "epoch": 0.24,
1698
+ "learning_rate": 0.0003377095309372985,
1699
+ "loss": 3.3293,
1700
+ "step": 56400
1701
+ },
1702
+ {
1703
+ "epoch": 0.24,
1704
+ "learning_rate": 0.0003395711212455839,
1705
+ "loss": 3.3181,
1706
+ "step": 56600
1707
+ },
1708
+ {
1709
+ "epoch": 0.24,
1710
+ "learning_rate": 0.0003414358773899506,
1711
+ "loss": 3.3075,
1712
+ "step": 56800
1713
+ },
1714
+ {
1715
+ "epoch": 0.24,
1716
+ "learning_rate": 0.00034330376665106695,
1717
+ "loss": 3.3213,
1718
+ "step": 57000
1719
+ },
1720
+ {
1721
+ "epoch": 0.25,
1722
+ "learning_rate": 0.0003451747562546278,
1723
+ "loss": 3.3105,
1724
+ "step": 57200
1725
+ },
1726
+ {
1727
+ "epoch": 0.25,
1728
+ "learning_rate": 0.00034704881337192784,
1729
+ "loss": 3.2828,
1730
+ "step": 57400
1731
+ },
1732
+ {
1733
+ "epoch": 0.25,
1734
+ "learning_rate": 0.00034892590512043947,
1735
+ "loss": 3.3044,
1736
+ "step": 57600
1737
+ },
1738
+ {
1739
+ "epoch": 0.25,
1740
+ "learning_rate": 0.00035080599856438877,
1741
+ "loss": 3.306,
1742
+ "step": 57800
1743
+ },
1744
+ {
1745
+ "epoch": 0.25,
1746
+ "learning_rate": 0.00035268906071533304,
1747
+ "loss": 3.3226,
1748
+ "step": 58000
1749
+ },
1750
+ {
1751
+ "epoch": 0.25,
1752
+ "learning_rate": 0.0003545750585327406,
1753
+ "loss": 3.3227,
1754
+ "step": 58200
1755
+ },
1756
+ {
1757
+ "epoch": 0.25,
1758
+ "learning_rate": 0.0003564639589245703,
1759
+ "loss": 3.2888,
1760
+ "step": 58400
1761
+ },
1762
+ {
1763
+ "epoch": 0.25,
1764
+ "learning_rate": 0.0003583557287478512,
1765
+ "loss": 3.2918,
1766
+ "step": 58600
1767
+ },
1768
+ {
1769
+ "epoch": 0.25,
1770
+ "learning_rate": 0.0003602503348092654,
1771
+ "loss": 3.2857,
1772
+ "step": 58800
1773
+ },
1774
+ {
1775
+ "epoch": 0.25,
1776
+ "learning_rate": 0.0003621477438657296,
1777
+ "loss": 3.324,
1778
+ "step": 59000
1779
+ },
1780
+ {
1781
+ "epoch": 0.25,
1782
+ "learning_rate": 0.00036404792262497885,
1783
+ "loss": 3.2991,
1784
+ "step": 59200
1785
+ },
1786
+ {
1787
+ "epoch": 0.26,
1788
+ "learning_rate": 0.00036595083774615053,
1789
+ "loss": 3.2798,
1790
+ "step": 59400
1791
+ },
1792
+ {
1793
+ "epoch": 0.26,
1794
+ "learning_rate": 0.0003678564558403689,
1795
+ "loss": 3.3164,
1796
+ "step": 59600
1797
+ },
1798
+ {
1799
+ "epoch": 0.26,
1800
+ "learning_rate": 0.0003697647434713321,
1801
+ "loss": 3.2885,
1802
+ "step": 59800
1803
+ },
1804
+ {
1805
+ "epoch": 0.26,
1806
+ "learning_rate": 0.0003716756671558975,
1807
+ "loss": 3.3157,
1808
+ "step": 60000
1809
  }
1810
  ],
1811
  "max_steps": 500000,
1812
  "num_train_epochs": 3,
1813
+ "total_flos": 9.562979893248e+16,
1814
  "trial_name": null,
1815
  "trial_params": null
1816
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31c53fb7b6ad0de21b811e1658611e4a33f3b1de2bf1f9601b1aaa1ab1a4a342
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c51f436fc6ff7c66c8286fca81fd6d00dc485176f29ebe17de85db28a4fa91b5
3
  size 146774203