Xibanya commited on
Commit
4c39b3e
1 Parent(s): a51c00e

Update trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +846 -12
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 30.0,
5
- "global_step": 19890,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -1340,18 +1340,852 @@
1340
  "step": 19890
1341
  },
1342
  {
1343
- "epoch": 30.0,
1344
- "step": 19890,
1345
- "total_flos": 5197098516480000.0,
1346
- "train_loss": 0.14960067772038202,
1347
- "train_runtime": 189.9317,
1348
- "train_samples_per_second": 104.722,
1349
- "train_steps_per_second": 104.722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1350
  }
1351
  ],
1352
- "max_steps": 19890,
1353
- "num_train_epochs": 30,
1354
- "total_flos": 5197098516480000.0,
1355
  "trial_name": null,
1356
  "trial_params": null
1357
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 90.0,
5
+ "global_step": 29790,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
1340
  "step": 19890
1341
  },
1342
  {
1343
+ "epoch": 60.12,
1344
+ "learning_rate": 8.693541350698569e-09,
1345
+ "loss": 1.5417,
1346
+ "step": 19900
1347
+ },
1348
+ {
1349
+ "epoch": 60.42,
1350
+ "learning_rate": 6.223106707028002e-08,
1351
+ "loss": 1.3927,
1352
+ "step": 20000
1353
+ },
1354
+ {
1355
+ "epoch": 60.73,
1356
+ "learning_rate": 1.2172156539717049e-07,
1357
+ "loss": 1.3567,
1358
+ "step": 20100
1359
+ },
1360
+ {
1361
+ "epoch": 61.0,
1362
+ "eval_loss": 3.4576382637023926,
1363
+ "eval_runtime": 0.952,
1364
+ "eval_samples_per_second": 35.715,
1365
+ "eval_steps_per_second": 35.715,
1366
+ "step": 20191
1367
+ },
1368
+ {
1369
+ "epoch": 61.03,
1370
+ "learning_rate": 1.3426435048201084e-07,
1371
+ "loss": 1.3891,
1372
+ "step": 20200
1373
+ },
1374
+ {
1375
+ "epoch": 61.33,
1376
+ "learning_rate": 8.877437479246501e-08,
1377
+ "loss": 1.3829,
1378
+ "step": 20300
1379
+ },
1380
+ {
1381
+ "epoch": 61.63,
1382
+ "learning_rate": 2.5454714388004032e-08,
1383
+ "loss": 1.3482,
1384
+ "step": 20400
1385
+ },
1386
+ {
1387
+ "epoch": 61.93,
1388
+ "learning_rate": 2.6594266338289214e-10,
1389
+ "loss": 1.3278,
1390
+ "step": 20500
1391
+ },
1392
+ {
1393
+ "epoch": 62.0,
1394
+ "eval_loss": 3.452810525894165,
1395
+ "eval_runtime": 0.964,
1396
+ "eval_samples_per_second": 35.269,
1397
+ "eval_steps_per_second": 35.269,
1398
+ "step": 20522
1399
+ },
1400
+ {
1401
+ "epoch": 62.24,
1402
+ "learning_rate": 3.54693620295034e-08,
1403
+ "loss": 1.3495,
1404
+ "step": 20600
1405
+ },
1406
+ {
1407
+ "epoch": 62.54,
1408
+ "learning_rate": 9.995293681528824e-08,
1409
+ "loss": 1.3372,
1410
+ "step": 20700
1411
+ },
1412
+ {
1413
+ "epoch": 62.84,
1414
+ "learning_rate": 1.3672745077248097e-07,
1415
+ "loss": 1.3292,
1416
+ "step": 20800
1417
+ },
1418
+ {
1419
+ "epoch": 63.0,
1420
+ "eval_loss": 3.4468021392822266,
1421
+ "eval_runtime": 0.9878,
1422
+ "eval_samples_per_second": 34.42,
1423
+ "eval_steps_per_second": 34.42,
1424
+ "step": 20853
1425
+ },
1426
+ {
1427
+ "epoch": 63.14,
1428
+ "learning_rate": 1.1329236813994995e-07,
1429
+ "loss": 1.3209,
1430
+ "step": 20900
1431
+ },
1432
+ {
1433
+ "epoch": 63.44,
1434
+ "learning_rate": 5.035911808981744e-08,
1435
+ "loss": 1.3127,
1436
+ "step": 21000
1437
+ },
1438
+ {
1439
+ "epoch": 63.75,
1440
+ "learning_rate": 3.5467727381560775e-09,
1441
+ "loss": 1.3285,
1442
+ "step": 21100
1443
+ },
1444
+ {
1445
+ "epoch": 64.0,
1446
+ "eval_loss": 3.443075180053711,
1447
+ "eval_runtime": 0.9684,
1448
+ "eval_samples_per_second": 35.111,
1449
+ "eval_steps_per_second": 35.111,
1450
+ "step": 21184
1451
+ },
1452
+ {
1453
+ "epoch": 64.05,
1454
+ "learning_rate": 1.4227090415615305e-08,
1455
+ "loss": 1.2899,
1456
+ "step": 21200
1457
+ },
1458
+ {
1459
+ "epoch": 64.35,
1460
+ "learning_rate": 7.296103279743161e-08,
1461
+ "loss": 1.3259,
1462
+ "step": 21300
1463
+ },
1464
+ {
1465
+ "epoch": 64.65,
1466
+ "learning_rate": 1.2784078685324094e-07,
1467
+ "loss": 1.308,
1468
+ "step": 21400
1469
+ },
1470
+ {
1471
+ "epoch": 64.95,
1472
+ "learning_rate": 1.303647894806954e-07,
1473
+ "loss": 1.3032,
1474
+ "step": 21500
1475
+ },
1476
+ {
1477
+ "epoch": 65.0,
1478
+ "eval_loss": 3.437013626098633,
1479
+ "eval_runtime": 0.9607,
1480
+ "eval_samples_per_second": 35.392,
1481
+ "eval_steps_per_second": 35.392,
1482
+ "step": 21515
1483
+ },
1484
+ {
1485
+ "epoch": 65.26,
1486
+ "learning_rate": 7.830238067025156e-08,
1487
+ "loss": 1.2946,
1488
+ "step": 21600
1489
+ },
1490
+ {
1491
+ "epoch": 65.56,
1492
+ "learning_rate": 1.766521352829689e-08,
1493
+ "loss": 1.3209,
1494
+ "step": 21700
1495
+ },
1496
+ {
1497
+ "epoch": 65.86,
1498
+ "learning_rate": 2.043130777021614e-09,
1499
+ "loss": 1.318,
1500
+ "step": 21800
1501
+ },
1502
+ {
1503
+ "epoch": 66.0,
1504
+ "eval_loss": 3.4345176219940186,
1505
+ "eval_runtime": 0.9583,
1506
+ "eval_samples_per_second": 35.479,
1507
+ "eval_steps_per_second": 35.479,
1508
+ "step": 21846
1509
+ },
1510
+ {
1511
+ "epoch": 66.16,
1512
+ "learning_rate": 4.5242597940677724e-08,
1513
+ "loss": 1.2637,
1514
+ "step": 21900
1515
+ },
1516
+ {
1517
+ "epoch": 66.47,
1518
+ "learning_rate": 1.090848418327998e-07,
1519
+ "loss": 1.2813,
1520
+ "step": 22000
1521
+ },
1522
+ {
1523
+ "epoch": 66.77,
1524
+ "learning_rate": 1.3714744085178092e-07,
1525
+ "loss": 1.3003,
1526
+ "step": 22100
1527
+ },
1528
+ {
1529
+ "epoch": 67.0,
1530
+ "eval_loss": 3.428901195526123,
1531
+ "eval_runtime": 0.9557,
1532
+ "eval_samples_per_second": 35.575,
1533
+ "eval_steps_per_second": 35.575,
1534
+ "step": 22177
1535
+ },
1536
+ {
1537
+ "epoch": 67.07,
1538
+ "learning_rate": 1.0462926495336562e-07,
1539
+ "loss": 1.3045,
1540
+ "step": 22200
1541
+ },
1542
+ {
1543
+ "epoch": 67.37,
1544
+ "learning_rate": 4.026918856301729e-08,
1545
+ "loss": 1.2654,
1546
+ "step": 22300
1547
+ },
1548
+ {
1549
+ "epoch": 67.67,
1550
+ "learning_rate": 9.472826654548543e-10,
1551
+ "loss": 1.2689,
1552
+ "step": 22400
1553
+ },
1554
+ {
1555
+ "epoch": 67.98,
1556
+ "learning_rate": 2.141541401022131e-08,
1557
+ "loss": 1.3202,
1558
+ "step": 22500
1559
+ },
1560
+ {
1561
+ "epoch": 68.0,
1562
+ "eval_loss": 3.427440881729126,
1563
+ "eval_runtime": 0.958,
1564
+ "eval_samples_per_second": 35.49,
1565
+ "eval_steps_per_second": 35.49,
1566
+ "step": 22508
1567
+ },
1568
+ {
1569
+ "epoch": 68.28,
1570
+ "learning_rate": 8.358428206955252e-08,
1571
+ "loss": 1.3008,
1572
+ "step": 22600
1573
+ },
1574
+ {
1575
+ "epoch": 68.58,
1576
+ "learning_rate": 1.3251035932481603e-07,
1577
+ "loss": 1.2628,
1578
+ "step": 22700
1579
+ },
1580
+ {
1581
+ "epoch": 68.88,
1582
+ "learning_rate": 1.2495381600320813e-07,
1583
+ "loss": 1.2643,
1584
+ "step": 22800
1585
+ },
1586
+ {
1587
+ "epoch": 69.0,
1588
+ "eval_loss": 3.4231972694396973,
1589
+ "eval_runtime": 0.9561,
1590
+ "eval_samples_per_second": 35.563,
1591
+ "eval_steps_per_second": 35.563,
1592
+ "step": 22839
1593
+ },
1594
+ {
1595
+ "epoch": 69.18,
1596
+ "learning_rate": 6.759296488244086e-08,
1597
+ "loss": 1.2717,
1598
+ "step": 22900
1599
+ },
1600
+ {
1601
+ "epoch": 69.49,
1602
+ "learning_rate": 1.1122110047934364e-08,
1603
+ "loss": 1.2759,
1604
+ "step": 23000
1605
+ },
1606
+ {
1607
+ "epoch": 69.79,
1608
+ "learning_rate": 5.44899573655199e-09,
1609
+ "loss": 1.2862,
1610
+ "step": 23100
1611
+ },
1612
+ {
1613
+ "epoch": 70.0,
1614
+ "eval_loss": 3.4223098754882812,
1615
+ "eval_runtime": 1.0568,
1616
+ "eval_samples_per_second": 32.174,
1617
+ "eval_steps_per_second": 32.174,
1618
+ "step": 23170
1619
+ },
1620
+ {
1621
+ "epoch": 70.09,
1622
+ "learning_rate": 5.492884226688333e-08,
1623
+ "loss": 1.2962,
1624
+ "step": 23200
1625
+ },
1626
+ {
1627
+ "epoch": 70.39,
1628
+ "learning_rate": 1.1675016817694061e-07,
1629
+ "loss": 1.2762,
1630
+ "step": 23300
1631
+ },
1632
+ {
1633
+ "epoch": 70.69,
1634
+ "learning_rate": 1.3601739630719386e-07,
1635
+ "loss": 1.2783,
1636
+ "step": 23400
1637
+ },
1638
+ {
1639
+ "epoch": 71.0,
1640
+ "learning_rate": 9.570255892993473e-08,
1641
+ "loss": 1.2597,
1642
+ "step": 23500
1643
+ },
1644
+ {
1645
+ "epoch": 71.0,
1646
+ "eval_loss": 3.418621063232422,
1647
+ "eval_runtime": 1.0452,
1648
+ "eval_samples_per_second": 32.531,
1649
+ "eval_steps_per_second": 32.531,
1650
+ "step": 23501
1651
+ },
1652
+ {
1653
+ "epoch": 71.3,
1654
+ "learning_rate": 3.143505458484596e-08,
1655
+ "loss": 1.3153,
1656
+ "step": 23600
1657
+ },
1658
+ {
1659
+ "epoch": 71.6,
1660
+ "learning_rate": 1.3141045706995258e-11,
1661
+ "loss": 1.245,
1662
+ "step": 23700
1663
+ },
1664
+ {
1665
+ "epoch": 71.9,
1666
+ "learning_rate": 2.920683930724396e-08,
1667
+ "loss": 1.2426,
1668
+ "step": 23800
1669
+ },
1670
+ {
1671
+ "epoch": 72.0,
1672
+ "eval_loss": 3.4176230430603027,
1673
+ "eval_runtime": 1.0483,
1674
+ "eval_samples_per_second": 32.434,
1675
+ "eval_steps_per_second": 32.434,
1676
+ "step": 23832
1677
+ },
1678
+ {
1679
+ "epoch": 72.21,
1680
+ "learning_rate": 9.32153777956087e-08,
1681
+ "loss": 1.276,
1682
+ "step": 23900
1683
+ },
1684
+ {
1685
+ "epoch": 72.51,
1686
+ "learning_rate": 1.3546936727073083e-07,
1687
+ "loss": 1.2707,
1688
+ "step": 24000
1689
+ },
1690
+ {
1691
+ "epoch": 72.81,
1692
+ "learning_rate": 1.1862562768173958e-07,
1693
+ "loss": 1.2539,
1694
+ "step": 24100
1695
+ },
1696
+ {
1697
+ "epoch": 73.0,
1698
+ "eval_loss": 3.4151535034179688,
1699
+ "eval_runtime": 1.0259,
1700
+ "eval_samples_per_second": 33.143,
1701
+ "eval_steps_per_second": 33.143,
1702
+ "step": 24163
1703
+ },
1704
+ {
1705
+ "epoch": 73.11,
1706
+ "learning_rate": 5.7570298965124436e-08,
1707
+ "loss": 1.2827,
1708
+ "step": 24200
1709
+ },
1710
+ {
1711
+ "epoch": 73.41,
1712
+ "learning_rate": 6.262786177113756e-09,
1713
+ "loss": 1.2277,
1714
+ "step": 24300
1715
+ },
1716
+ {
1717
+ "epoch": 73.72,
1718
+ "learning_rate": 1.0047581324385604e-08,
1719
+ "loss": 1.2604,
1720
+ "step": 24400
1721
+ },
1722
+ {
1723
+ "epoch": 74.0,
1724
+ "eval_loss": 3.414668083190918,
1725
+ "eval_runtime": 1.0049,
1726
+ "eval_samples_per_second": 33.834,
1727
+ "eval_steps_per_second": 33.834,
1728
+ "step": 24494
1729
+ },
1730
+ {
1731
+ "epoch": 74.02,
1732
+ "learning_rate": 6.55797626990514e-08,
1733
+ "loss": 1.2588,
1734
+ "step": 24500
1735
+ },
1736
+ {
1737
+ "epoch": 74.32,
1738
+ "learning_rate": 1.2378116576678553e-07,
1739
+ "loss": 1.2695,
1740
+ "step": 24600
1741
+ },
1742
+ {
1743
+ "epoch": 74.62,
1744
+ "learning_rate": 1.3321462445238648e-07,
1745
+ "loss": 1.2576,
1746
+ "step": 24700
1747
+ },
1748
+ {
1749
+ "epoch": 74.92,
1750
+ "learning_rate": 8.554304797283439e-08,
1751
+ "loss": 1.263,
1752
+ "step": 24800
1753
+ },
1754
+ {
1755
+ "epoch": 75.0,
1756
+ "eval_loss": 3.412790298461914,
1757
+ "eval_runtime": 1.012,
1758
+ "eval_samples_per_second": 33.597,
1759
+ "eval_steps_per_second": 33.597,
1760
+ "step": 24825
1761
+ },
1762
+ {
1763
+ "epoch": 75.23,
1764
+ "learning_rate": 2.2897564912636232e-08,
1765
+ "loss": 1.2362,
1766
+ "step": 24900
1767
+ },
1768
+ {
1769
+ "epoch": 75.53,
1770
+ "learning_rate": 6.429250232458032e-10,
1771
+ "loss": 1.2653,
1772
+ "step": 25000
1773
+ },
1774
+ {
1775
+ "epoch": 75.83,
1776
+ "learning_rate": 3.844730720753521e-08,
1777
+ "loss": 1.2642,
1778
+ "step": 25100
1779
+ },
1780
+ {
1781
+ "epoch": 76.0,
1782
+ "eval_loss": 3.412682294845581,
1783
+ "eval_runtime": 1.006,
1784
+ "eval_samples_per_second": 33.797,
1785
+ "eval_steps_per_second": 33.797,
1786
+ "step": 25156
1787
+ },
1788
+ {
1789
+ "epoch": 76.13,
1790
+ "learning_rate": 1.0289999999999885e-07,
1791
+ "loss": 1.2254,
1792
+ "step": 25200
1793
+ },
1794
+ {
1795
+ "epoch": 76.44,
1796
+ "learning_rate": 1.370390800033334e-07,
1797
+ "loss": 1.2543,
1798
+ "step": 25300
1799
+ },
1800
+ {
1801
+ "epoch": 76.74,
1802
+ "learning_rate": 1.106931521097187e-07,
1803
+ "loss": 1.2694,
1804
+ "step": 25400
1805
+ },
1806
+ {
1807
+ "epoch": 77.0,
1808
+ "eval_loss": 3.4108803272247314,
1809
+ "eval_runtime": 1.004,
1810
+ "eval_samples_per_second": 33.865,
1811
+ "eval_steps_per_second": 33.865,
1812
+ "step": 25487
1813
+ },
1814
+ {
1815
+ "epoch": 77.04,
1816
+ "learning_rate": 4.7146188818785276e-08,
1817
+ "loss": 1.25,
1818
+ "step": 25500
1819
+ },
1820
+ {
1821
+ "epoch": 77.34,
1822
+ "learning_rate": 2.559648997196144e-09,
1823
+ "loss": 1.2614,
1824
+ "step": 25600
1825
+ },
1826
+ {
1827
+ "epoch": 77.64,
1828
+ "learning_rate": 1.6338171238500445e-08,
1829
+ "loss": 1.2549,
1830
+ "step": 25700
1831
+ },
1832
+ {
1833
+ "epoch": 77.95,
1834
+ "learning_rate": 7.63045897109416e-08,
1835
+ "loss": 1.2251,
1836
+ "step": 25800
1837
+ },
1838
+ {
1839
+ "epoch": 78.0,
1840
+ "eval_loss": 3.4106197357177734,
1841
+ "eval_runtime": 1.0035,
1842
+ "eval_samples_per_second": 33.882,
1843
+ "eval_steps_per_second": 33.882,
1844
+ "step": 25818
1845
+ },
1846
+ {
1847
+ "epoch": 78.25,
1848
+ "learning_rate": 1.2946185514557884e-07,
1849
+ "loss": 1.2965,
1850
+ "step": 25900
1851
+ },
1852
+ {
1853
+ "epoch": 78.55,
1854
+ "learning_rate": 1.288307033641468e-07,
1855
+ "loss": 1.2137,
1856
+ "step": 26000
1857
+ },
1858
+ {
1859
+ "epoch": 78.85,
1860
+ "learning_rate": 7.496893292971942e-08,
1861
+ "loss": 1.2673,
1862
+ "step": 26100
1863
+ },
1864
+ {
1865
+ "epoch": 79.0,
1866
+ "eval_loss": 3.409691333770752,
1867
+ "eval_runtime": 1.0062,
1868
+ "eval_samples_per_second": 33.792,
1869
+ "eval_steps_per_second": 33.792,
1870
+ "step": 26149
1871
+ },
1872
+ {
1873
+ "epoch": 79.15,
1874
+ "learning_rate": 1.5478434602829786e-08,
1875
+ "loss": 1.225,
1876
+ "step": 26200
1877
+ },
1878
+ {
1879
+ "epoch": 79.46,
1880
+ "learning_rate": 2.9356495179890543e-09,
1881
+ "loss": 1.2545,
1882
+ "step": 26300
1883
+ },
1884
+ {
1885
+ "epoch": 79.76,
1886
+ "learning_rate": 4.842562520753367e-08,
1887
+ "loss": 1.233,
1888
+ "step": 26400
1889
+ },
1890
+ {
1891
+ "epoch": 80.0,
1892
+ "eval_loss": 3.409552574157715,
1893
+ "eval_runtime": 1.0059,
1894
+ "eval_samples_per_second": 33.8,
1895
+ "eval_steps_per_second": 33.8,
1896
+ "step": 26480
1897
+ },
1898
+ {
1899
+ "epoch": 80.06,
1900
+ "learning_rate": 1.1174528561199565e-07,
1901
+ "loss": 1.2286,
1902
+ "step": 26500
1903
+ },
1904
+ {
1905
+ "epoch": 80.36,
1906
+ "learning_rate": 1.3693405733661706e-07,
1907
+ "loss": 1.2464,
1908
+ "step": 26600
1909
+ },
1910
+ {
1911
+ "epoch": 80.66,
1912
+ "learning_rate": 1.0173063797049695e-07,
1913
+ "loss": 1.2497,
1914
+ "step": 26700
1915
+ },
1916
+ {
1917
+ "epoch": 80.97,
1918
+ "learning_rate": 3.7247063184712124e-08,
1919
+ "loss": 1.2408,
1920
+ "step": 26800
1921
+ },
1922
+ {
1923
+ "epoch": 81.0,
1924
+ "eval_loss": 3.4087181091308594,
1925
+ "eval_runtime": 1.0321,
1926
+ "eval_samples_per_second": 32.942,
1927
+ "eval_steps_per_second": 32.942,
1928
+ "step": 26811
1929
+ },
1930
+ {
1931
+ "epoch": 81.27,
1932
+ "learning_rate": 4.725492275192098e-10,
1933
+ "loss": 1.2461,
1934
+ "step": 26900
1935
+ },
1936
+ {
1937
+ "epoch": 81.57,
1938
+ "learning_rate": 2.3400433722385994e-08,
1939
+ "loss": 1.2495,
1940
+ "step": 27000
1941
+ },
1942
+ {
1943
+ "epoch": 81.87,
1944
+ "learning_rate": 8.619280748313942e-08,
1945
+ "loss": 1.2579,
1946
+ "step": 27100
1947
+ },
1948
+ {
1949
+ "epoch": 82.0,
1950
+ "eval_loss": 3.408783435821533,
1951
+ "eval_runtime": 1.0924,
1952
+ "eval_samples_per_second": 31.123,
1953
+ "eval_steps_per_second": 31.123,
1954
+ "step": 27142
1955
+ },
1956
+ {
1957
+ "epoch": 82.18,
1958
+ "learning_rate": 1.3343703098390236e-07,
1959
+ "loss": 1.2153,
1960
+ "step": 27200
1961
+ },
1962
+ {
1963
+ "epoch": 82.48,
1964
+ "learning_rate": 1.233796611423087e-07,
1965
+ "loss": 1.2398,
1966
+ "step": 27300
1967
+ },
1968
+ {
1969
+ "epoch": 82.78,
1970
+ "learning_rate": 6.490918819320101e-08,
1971
+ "loss": 1.2346,
1972
+ "step": 27400
1973
+ },
1974
+ {
1975
+ "epoch": 83.0,
1976
+ "eval_loss": 3.408099412918091,
1977
+ "eval_runtime": 1.0406,
1978
+ "eval_samples_per_second": 32.673,
1979
+ "eval_steps_per_second": 32.673,
1980
+ "step": 27473
1981
+ },
1982
+ {
1983
+ "epoch": 83.08,
1984
+ "learning_rate": 9.700576468258166e-09,
1985
+ "loss": 1.2543,
1986
+ "step": 27500
1987
+ },
1988
+ {
1989
+ "epoch": 83.38,
1990
+ "learning_rate": 6.546026505537958e-09,
1991
+ "loss": 1.247,
1992
+ "step": 27600
1993
+ },
1994
+ {
1995
+ "epoch": 83.69,
1996
+ "learning_rate": 5.823346268095307e-08,
1997
+ "loss": 1.2225,
1998
+ "step": 27700
1999
+ },
2000
+ {
2001
+ "epoch": 83.99,
2002
+ "learning_rate": 1.190826247529784e-07,
2003
+ "loss": 1.2298,
2004
+ "step": 27800
2005
+ },
2006
+ {
2007
+ "epoch": 84.0,
2008
+ "eval_loss": 3.4082374572753906,
2009
+ "eval_runtime": 1.0242,
2010
+ "eval_samples_per_second": 33.197,
2011
+ "eval_steps_per_second": 33.197,
2012
+ "step": 27804
2013
+ },
2014
+ {
2015
+ "epoch": 84.29,
2016
+ "learning_rate": 1.3531631337483422e-07,
2017
+ "loss": 1.2426,
2018
+ "step": 27900
2019
+ },
2020
+ {
2021
+ "epoch": 84.59,
2022
+ "learning_rate": 9.258753872080096e-08,
2023
+ "loss": 1.2389,
2024
+ "step": 28000
2025
+ },
2026
+ {
2027
+ "epoch": 84.89,
2028
+ "learning_rate": 2.865908591672022e-08,
2029
+ "loss": 1.219,
2030
+ "step": 28100
2031
+ },
2032
+ {
2033
+ "epoch": 85.0,
2034
+ "eval_loss": 3.407871961593628,
2035
+ "eval_runtime": 0.9538,
2036
+ "eval_samples_per_second": 35.648,
2037
+ "eval_steps_per_second": 35.648,
2038
+ "step": 28135
2039
+ },
2040
+ {
2041
+ "epoch": 85.2,
2042
+ "learning_rate": 2.956617282787915e-11,
2043
+ "loss": 1.2712,
2044
+ "step": 28200
2045
+ },
2046
+ {
2047
+ "epoch": 85.5,
2048
+ "learning_rate": 3.2001142050499094e-08,
2049
+ "loss": 1.2184,
2050
+ "step": 28300
2051
+ },
2052
+ {
2053
+ "epoch": 85.8,
2054
+ "learning_rate": 9.631801263387146e-08,
2055
+ "loss": 1.2515,
2056
+ "step": 28400
2057
+ },
2058
+ {
2059
+ "epoch": 86.0,
2060
+ "eval_loss": 3.4080073833465576,
2061
+ "eval_runtime": 0.951,
2062
+ "eval_samples_per_second": 35.752,
2063
+ "eval_steps_per_second": 35.752,
2064
+ "step": 28466
2065
+ },
2066
+ {
2067
+ "epoch": 86.1,
2068
+ "learning_rate": 1.361382913144042e-07,
2069
+ "loss": 1.2196,
2070
+ "step": 28500
2071
+ },
2072
+ {
2073
+ "epoch": 86.4,
2074
+ "learning_rate": 1.162696600432059e-07,
2075
+ "loss": 1.2085,
2076
+ "step": 28600
2077
+ },
2078
+ {
2079
+ "epoch": 86.71,
2080
+ "learning_rate": 5.427159389331479e-08,
2081
+ "loss": 1.2316,
2082
+ "step": 28700
2083
+ },
2084
+ {
2085
+ "epoch": 87.0,
2086
+ "eval_loss": 3.408378839492798,
2087
+ "eval_runtime": 0.9581,
2088
+ "eval_samples_per_second": 35.486,
2089
+ "eval_steps_per_second": 35.486,
2090
+ "step": 28797
2091
+ },
2092
+ {
2093
+ "epoch": 87.01,
2094
+ "learning_rate": 4.936669277050985e-09,
2095
+ "loss": 1.2578,
2096
+ "step": 28800
2097
+ },
2098
+ {
2099
+ "epoch": 87.31,
2100
+ "learning_rate": 1.1866046596701257e-08,
2101
+ "loss": 1.24,
2102
+ "step": 28900
2103
+ },
2104
+ {
2105
+ "epoch": 87.61,
2106
+ "learning_rate": 6.893568909023435e-08,
2107
+ "loss": 1.2342,
2108
+ "step": 29000
2109
+ },
2110
+ {
2111
+ "epoch": 87.92,
2112
+ "learning_rate": 1.2570865668292598e-07,
2113
+ "loss": 1.2085,
2114
+ "step": 29100
2115
+ },
2116
+ {
2117
+ "epoch": 88.0,
2118
+ "eval_loss": 3.4085471630096436,
2119
+ "eval_runtime": 1.0334,
2120
+ "eval_samples_per_second": 32.901,
2121
+ "eval_steps_per_second": 32.901,
2122
+ "step": 29128
2123
+ },
2124
+ {
2125
+ "epoch": 88.22,
2126
+ "learning_rate": 1.32010204287465e-07,
2127
+ "loss": 1.2504,
2128
+ "step": 29200
2129
+ },
2130
+ {
2131
+ "epoch": 88.52,
2132
+ "learning_rate": 8.227115773311707e-08,
2133
+ "loss": 1.2419,
2134
+ "step": 29300
2135
+ },
2136
+ {
2137
+ "epoch": 88.82,
2138
+ "learning_rate": 2.0449831823058302e-08,
2139
+ "loss": 1.2334,
2140
+ "step": 29400
2141
+ },
2142
+ {
2143
+ "epoch": 89.0,
2144
+ "eval_loss": 3.4085144996643066,
2145
+ "eval_runtime": 1.0283,
2146
+ "eval_samples_per_second": 33.063,
2147
+ "eval_steps_per_second": 33.063,
2148
+ "step": 29459
2149
+ },
2150
+ {
2151
+ "epoch": 89.12,
2152
+ "learning_rate": 1.1826036928064389e-09,
2153
+ "loss": 1.2213,
2154
+ "step": 29500
2155
+ },
2156
+ {
2157
+ "epoch": 89.43,
2158
+ "learning_rate": 4.088198736612909e-08,
2159
+ "loss": 1.2165,
2160
+ "step": 29600
2161
+ },
2162
+ {
2163
+ "epoch": 89.73,
2164
+ "learning_rate": 1.0519885794950143e-07,
2165
+ "loss": 1.2263,
2166
+ "step": 29700
2167
+ },
2168
+ {
2169
+ "epoch": 90.0,
2170
+ "eval_loss": 3.408367156982422,
2171
+ "eval_runtime": 1.0186,
2172
+ "eval_samples_per_second": 33.38,
2173
+ "eval_steps_per_second": 33.38,
2174
+ "step": 29790
2175
+ },
2176
+ {
2177
+ "epoch": 90.0,
2178
+ "step": 29790,
2179
+ "total_flos": 1.037068075008e+16,
2180
+ "train_loss": 0.13739024364296004,
2181
+ "train_runtime": 461.3754,
2182
+ "train_samples_per_second": 64.568,
2183
+ "train_steps_per_second": 64.568
2184
  }
2185
  ],
2186
+ "max_steps": 29790,
2187
+ "num_train_epochs": 90,
2188
+ "total_flos": 1.037068075008e+16,
2189
  "trial_name": null,
2190
  "trial_params": null
2191
  }