Training in progress, step 6000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1520630616
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9310a4b888df283774971e4e671540bfed2da01aea080fa39eda067305eeba86
|
| 3 |
size 1520630616
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 3041448587
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f1f256b63f8887aa92c9795198c14b259ff29bd76f4e601214dd8ad4add4ccd6
|
| 3 |
size 3041448587
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2669ee2d37691d1bc42e7a0090a126e105acbd5de1cf305e31cb6b68e55636b7
|
| 3 |
size 14645
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a30b126d1da8ae8870320a9f300ee7d428169650eb20c3a488c09fc00bef14d8
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -568,6 +568,286 @@
|
|
| 568 |
"learning_rate": 0.00029976166518534735,
|
| 569 |
"loss": 2.4739,
|
| 570 |
"step": 4000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 571 |
}
|
| 572 |
],
|
| 573 |
"logging_steps": 50,
|
|
@@ -587,7 +867,7 @@
|
|
| 587 |
"attributes": {}
|
| 588 |
}
|
| 589 |
},
|
| 590 |
-
"total_flos":
|
| 591 |
"train_batch_size": 16,
|
| 592 |
"trial_name": null,
|
| 593 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.0279453551912567,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 6000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 568 |
"learning_rate": 0.00029976166518534735,
|
| 569 |
"loss": 2.4739,
|
| 570 |
"step": 4000
|
| 571 |
+
},
|
| 572 |
+
{
|
| 573 |
+
"epoch": 1.0066338797814207,
|
| 574 |
+
"grad_norm": 0.5546875,
|
| 575 |
+
"learning_rate": 0.00029974653116842764,
|
| 576 |
+
"loss": 2.4487,
|
| 577 |
+
"step": 4050
|
| 578 |
+
},
|
| 579 |
+
{
|
| 580 |
+
"epoch": 1.0071803278688525,
|
| 581 |
+
"grad_norm": 0.60546875,
|
| 582 |
+
"learning_rate": 0.0002997309317358347,
|
| 583 |
+
"loss": 2.4674,
|
| 584 |
+
"step": 4100
|
| 585 |
+
},
|
| 586 |
+
{
|
| 587 |
+
"epoch": 1.0077267759562842,
|
| 588 |
+
"grad_norm": 0.67578125,
|
| 589 |
+
"learning_rate": 0.0002997148669360519,
|
| 590 |
+
"loss": 2.4814,
|
| 591 |
+
"step": 4150
|
| 592 |
+
},
|
| 593 |
+
{
|
| 594 |
+
"epoch": 1.0082732240437158,
|
| 595 |
+
"grad_norm": 0.53515625,
|
| 596 |
+
"learning_rate": 0.00029969833681900914,
|
| 597 |
+
"loss": 2.448,
|
| 598 |
+
"step": 4200
|
| 599 |
+
},
|
| 600 |
+
{
|
| 601 |
+
"epoch": 1.0088196721311475,
|
| 602 |
+
"grad_norm": 0.68359375,
|
| 603 |
+
"learning_rate": 0.0002996813414360822,
|
| 604 |
+
"loss": 2.4299,
|
| 605 |
+
"step": 4250
|
| 606 |
+
},
|
| 607 |
+
{
|
| 608 |
+
"epoch": 1.0093661202185793,
|
| 609 |
+
"grad_norm": 0.57421875,
|
| 610 |
+
"learning_rate": 0.00029966388084009334,
|
| 611 |
+
"loss": 2.4271,
|
| 612 |
+
"step": 4300
|
| 613 |
+
},
|
| 614 |
+
{
|
| 615 |
+
"epoch": 1.0099125683060108,
|
| 616 |
+
"grad_norm": 0.515625,
|
| 617 |
+
"learning_rate": 0.00029964595508531034,
|
| 618 |
+
"loss": 2.4848,
|
| 619 |
+
"step": 4350
|
| 620 |
+
},
|
| 621 |
+
{
|
| 622 |
+
"epoch": 1.0104590163934426,
|
| 623 |
+
"grad_norm": 0.53125,
|
| 624 |
+
"learning_rate": 0.00029962756422744695,
|
| 625 |
+
"loss": 2.414,
|
| 626 |
+
"step": 4400
|
| 627 |
+
},
|
| 628 |
+
{
|
| 629 |
+
"epoch": 1.0110054644808744,
|
| 630 |
+
"grad_norm": 0.51171875,
|
| 631 |
+
"learning_rate": 0.00029960870832366224,
|
| 632 |
+
"loss": 2.3993,
|
| 633 |
+
"step": 4450
|
| 634 |
+
},
|
| 635 |
+
{
|
| 636 |
+
"epoch": 1.0115519125683061,
|
| 637 |
+
"grad_norm": 0.54296875,
|
| 638 |
+
"learning_rate": 0.000299589387432561,
|
| 639 |
+
"loss": 2.4171,
|
| 640 |
+
"step": 4500
|
| 641 |
+
},
|
| 642 |
+
{
|
| 643 |
+
"epoch": 1.0120983606557377,
|
| 644 |
+
"grad_norm": 0.490234375,
|
| 645 |
+
"learning_rate": 0.00029956960161419283,
|
| 646 |
+
"loss": 2.4038,
|
| 647 |
+
"step": 4550
|
| 648 |
+
},
|
| 649 |
+
{
|
| 650 |
+
"epoch": 1.0126448087431694,
|
| 651 |
+
"grad_norm": 0.498046875,
|
| 652 |
+
"learning_rate": 0.0002995493509300526,
|
| 653 |
+
"loss": 2.4128,
|
| 654 |
+
"step": 4600
|
| 655 |
+
},
|
| 656 |
+
{
|
| 657 |
+
"epoch": 1.0131912568306012,
|
| 658 |
+
"grad_norm": 0.51171875,
|
| 659 |
+
"learning_rate": 0.0002995286354430799,
|
| 660 |
+
"loss": 2.3721,
|
| 661 |
+
"step": 4650
|
| 662 |
+
},
|
| 663 |
+
{
|
| 664 |
+
"epoch": 1.0137377049180327,
|
| 665 |
+
"grad_norm": 0.51171875,
|
| 666 |
+
"learning_rate": 0.0002995074552176589,
|
| 667 |
+
"loss": 2.3734,
|
| 668 |
+
"step": 4700
|
| 669 |
+
},
|
| 670 |
+
{
|
| 671 |
+
"epoch": 1.0142841530054645,
|
| 672 |
+
"grad_norm": 0.515625,
|
| 673 |
+
"learning_rate": 0.00029948581031961826,
|
| 674 |
+
"loss": 2.3805,
|
| 675 |
+
"step": 4750
|
| 676 |
+
},
|
| 677 |
+
{
|
| 678 |
+
"epoch": 1.0148306010928962,
|
| 679 |
+
"grad_norm": 0.515625,
|
| 680 |
+
"learning_rate": 0.0002994637008162308,
|
| 681 |
+
"loss": 2.3819,
|
| 682 |
+
"step": 4800
|
| 683 |
+
},
|
| 684 |
+
{
|
| 685 |
+
"epoch": 1.0153770491803278,
|
| 686 |
+
"grad_norm": 0.53515625,
|
| 687 |
+
"learning_rate": 0.00029944112677621345,
|
| 688 |
+
"loss": 2.3839,
|
| 689 |
+
"step": 4850
|
| 690 |
+
},
|
| 691 |
+
{
|
| 692 |
+
"epoch": 1.0159234972677595,
|
| 693 |
+
"grad_norm": 0.4921875,
|
| 694 |
+
"learning_rate": 0.00029941808826972673,
|
| 695 |
+
"loss": 2.336,
|
| 696 |
+
"step": 4900
|
| 697 |
+
},
|
| 698 |
+
{
|
| 699 |
+
"epoch": 1.0164699453551913,
|
| 700 |
+
"grad_norm": 0.515625,
|
| 701 |
+
"learning_rate": 0.0002993945853683749,
|
| 702 |
+
"loss": 2.3126,
|
| 703 |
+
"step": 4950
|
| 704 |
+
},
|
| 705 |
+
{
|
| 706 |
+
"epoch": 1.0170163934426228,
|
| 707 |
+
"grad_norm": 0.53515625,
|
| 708 |
+
"learning_rate": 0.00029937061814520546,
|
| 709 |
+
"loss": 2.3271,
|
| 710 |
+
"step": 5000
|
| 711 |
+
},
|
| 712 |
+
{
|
| 713 |
+
"epoch": 1.0175628415300546,
|
| 714 |
+
"grad_norm": 0.53125,
|
| 715 |
+
"learning_rate": 0.00029934618667470925,
|
| 716 |
+
"loss": 2.3275,
|
| 717 |
+
"step": 5050
|
| 718 |
+
},
|
| 719 |
+
{
|
| 720 |
+
"epoch": 1.0181092896174864,
|
| 721 |
+
"grad_norm": 0.51953125,
|
| 722 |
+
"learning_rate": 0.0002993212910328197,
|
| 723 |
+
"loss": 2.2837,
|
| 724 |
+
"step": 5100
|
| 725 |
+
},
|
| 726 |
+
{
|
| 727 |
+
"epoch": 1.0186557377049181,
|
| 728 |
+
"grad_norm": 0.56640625,
|
| 729 |
+
"learning_rate": 0.00029929593129691305,
|
| 730 |
+
"loss": 2.2964,
|
| 731 |
+
"step": 5150
|
| 732 |
+
},
|
| 733 |
+
{
|
| 734 |
+
"epoch": 1.0192021857923497,
|
| 735 |
+
"grad_norm": 0.50390625,
|
| 736 |
+
"learning_rate": 0.000299270107545808,
|
| 737 |
+
"loss": 2.3155,
|
| 738 |
+
"step": 5200
|
| 739 |
+
},
|
| 740 |
+
{
|
| 741 |
+
"epoch": 1.0197486338797814,
|
| 742 |
+
"grad_norm": 0.55078125,
|
| 743 |
+
"learning_rate": 0.00029924381985976534,
|
| 744 |
+
"loss": 2.2722,
|
| 745 |
+
"step": 5250
|
| 746 |
+
},
|
| 747 |
+
{
|
| 748 |
+
"epoch": 1.0202950819672132,
|
| 749 |
+
"grad_norm": 0.49609375,
|
| 750 |
+
"learning_rate": 0.00029921706832048784,
|
| 751 |
+
"loss": 2.3175,
|
| 752 |
+
"step": 5300
|
| 753 |
+
},
|
| 754 |
+
{
|
| 755 |
+
"epoch": 1.0208415300546447,
|
| 756 |
+
"grad_norm": 0.48828125,
|
| 757 |
+
"learning_rate": 0.00029918985301111985,
|
| 758 |
+
"loss": 2.2834,
|
| 759 |
+
"step": 5350
|
| 760 |
+
},
|
| 761 |
+
{
|
| 762 |
+
"epoch": 1.0213879781420765,
|
| 763 |
+
"grad_norm": 0.734375,
|
| 764 |
+
"learning_rate": 0.00029916217401624716,
|
| 765 |
+
"loss": 2.2522,
|
| 766 |
+
"step": 5400
|
| 767 |
+
},
|
| 768 |
+
{
|
| 769 |
+
"epoch": 1.0219344262295083,
|
| 770 |
+
"grad_norm": 0.46484375,
|
| 771 |
+
"learning_rate": 0.00029913403142189677,
|
| 772 |
+
"loss": 2.2872,
|
| 773 |
+
"step": 5450
|
| 774 |
+
},
|
| 775 |
+
{
|
| 776 |
+
"epoch": 1.0224808743169398,
|
| 777 |
+
"grad_norm": 0.51953125,
|
| 778 |
+
"learning_rate": 0.00029910542531553656,
|
| 779 |
+
"loss": 2.2793,
|
| 780 |
+
"step": 5500
|
| 781 |
+
},
|
| 782 |
+
{
|
| 783 |
+
"epoch": 1.0230273224043716,
|
| 784 |
+
"grad_norm": 0.474609375,
|
| 785 |
+
"learning_rate": 0.00029907635578607487,
|
| 786 |
+
"loss": 2.218,
|
| 787 |
+
"step": 5550
|
| 788 |
+
},
|
| 789 |
+
{
|
| 790 |
+
"epoch": 1.0235737704918033,
|
| 791 |
+
"grad_norm": 0.49609375,
|
| 792 |
+
"learning_rate": 0.00029904682292386053,
|
| 793 |
+
"loss": 2.2309,
|
| 794 |
+
"step": 5600
|
| 795 |
+
},
|
| 796 |
+
{
|
| 797 |
+
"epoch": 1.024120218579235,
|
| 798 |
+
"grad_norm": 0.52734375,
|
| 799 |
+
"learning_rate": 0.0002990168268206823,
|
| 800 |
+
"loss": 2.2285,
|
| 801 |
+
"step": 5650
|
| 802 |
+
},
|
| 803 |
+
{
|
| 804 |
+
"epoch": 1.0246666666666666,
|
| 805 |
+
"grad_norm": 0.48828125,
|
| 806 |
+
"learning_rate": 0.00029898636756976884,
|
| 807 |
+
"loss": 2.2338,
|
| 808 |
+
"step": 5700
|
| 809 |
+
},
|
| 810 |
+
{
|
| 811 |
+
"epoch": 1.0252131147540984,
|
| 812 |
+
"grad_norm": 0.462890625,
|
| 813 |
+
"learning_rate": 0.0002989554452657881,
|
| 814 |
+
"loss": 2.2048,
|
| 815 |
+
"step": 5750
|
| 816 |
+
},
|
| 817 |
+
{
|
| 818 |
+
"epoch": 1.0257595628415301,
|
| 819 |
+
"grad_norm": 0.62109375,
|
| 820 |
+
"learning_rate": 0.0002989240600048475,
|
| 821 |
+
"loss": 2.2716,
|
| 822 |
+
"step": 5800
|
| 823 |
+
},
|
| 824 |
+
{
|
| 825 |
+
"epoch": 1.0263060109289617,
|
| 826 |
+
"grad_norm": 0.6015625,
|
| 827 |
+
"learning_rate": 0.00029889221188449295,
|
| 828 |
+
"loss": 2.2618,
|
| 829 |
+
"step": 5850
|
| 830 |
+
},
|
| 831 |
+
{
|
| 832 |
+
"epoch": 1.0268524590163934,
|
| 833 |
+
"grad_norm": 0.47265625,
|
| 834 |
+
"learning_rate": 0.0002988599010037092,
|
| 835 |
+
"loss": 2.2181,
|
| 836 |
+
"step": 5900
|
| 837 |
+
},
|
| 838 |
+
{
|
| 839 |
+
"epoch": 1.0273989071038252,
|
| 840 |
+
"grad_norm": 0.5234375,
|
| 841 |
+
"learning_rate": 0.0002988271274629192,
|
| 842 |
+
"loss": 2.2005,
|
| 843 |
+
"step": 5950
|
| 844 |
+
},
|
| 845 |
+
{
|
| 846 |
+
"epoch": 1.0279453551912567,
|
| 847 |
+
"grad_norm": 0.515625,
|
| 848 |
+
"learning_rate": 0.00029879389136398403,
|
| 849 |
+
"loss": 2.1958,
|
| 850 |
+
"step": 6000
|
| 851 |
}
|
| 852 |
],
|
| 853 |
"logging_steps": 50,
|
|
|
|
| 867 |
"attributes": {}
|
| 868 |
}
|
| 869 |
},
|
| 870 |
+
"total_flos": 3.2086020985643336e+18,
|
| 871 |
"train_batch_size": 16,
|
| 872 |
"trial_name": null,
|
| 873 |
"trial_params": null
|