Upload 10 files
- model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +365 -5
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:6d0a1a2ebc0f1f1e6bae76f519dbcc21bb42eeb93020fab49ae955c26480b74e
 size 20061432
optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5a5ea25c6472ccaf3b15d44dfb9cfe3f95c2d03ee19001eee198898ac6253a32
 size 40205626
rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c83fbbf7f760f16043b0c1585d6a2a676ee048a2decb272da8f2e2127ffb79b3
 size 14244
scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d78a469aae0653b14dcccaea45eb52458c95afae1649e5d894a88bcf0a974d36
 size 1064
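The four diffs above only swap the LFS oid: each binary is stored as a Git LFS pointer (a "version" line, an "oid sha256:<hex>" line, and a "size <bytes>" line), and the payload is addressed by its SHA-256. As a minimal sketch, not part of this commit and with hypothetical file names, a downloaded payload can be checked against its pointer like this:

# Minimal sketch (not part of this commit): verify a downloaded payload against
# a Git LFS pointer of the form shown above (version / oid sha256:<hex> / size <bytes>).
import hashlib
from pathlib import Path

def parse_lfs_pointer(pointer_path):
    # Pointer files are short "key value" lines.
    fields = {}
    for line in Path(pointer_path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

def verify_payload(pointer_path, payload_path):
    # Compare the payload's SHA-256 and byte size with the pointer's oid/size.
    fields = parse_lfs_pointer(pointer_path)
    expected_oid = fields["oid"].split("sha256:", 1)[-1]
    expected_size = int(fields["size"])
    data = Path(payload_path).read_bytes()
    return hashlib.sha256(data).hexdigest() == expected_oid and len(data) == expected_size

# Hypothetical usage:
# verify_payload("model.safetensors.pointer", "model.safetensors")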
trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
 {
-  "best_metric": 1.
-  "best_model_checkpoint": "/home/nlplab5/Desktop/roberta-pretrain/ckpt/roberta/pretrain/medium/checkpoint-
-  "epoch":
+  "best_metric": 1.6068978309631348,
+  "best_model_checkpoint": "/home/nlplab5/Desktop/roberta-pretrain/ckpt/roberta/pretrain/medium/checkpoint-12420",
+  "epoch": 9.049218813243746,
   "eval_steps": 90,
-  "global_step":
+  "global_step": 12420,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -9583,6 +9583,366 @@
       "eval_samples_per_second": 457.867,
       "eval_steps_per_second": 2.045,
       "step": 11970
+    },
+    {
+      "epoch": 8.73,
+      "grad_norm": 0.289413720369339,
+      "learning_rate": 5.1485148514851485e-05,
+      "loss": 1.758,
+      "step": 11980
+    },
+    {
+      "epoch": 8.74,
+      "grad_norm": 0.2739205062389374,
+      "learning_rate": 5.0495049504950497e-05,
+      "loss": 1.7579,
+      "step": 11990
+    },
+    {
+      "epoch": 8.74,
+      "grad_norm": 0.26597511768341064,
+      "learning_rate": 4.950495049504951e-05,
+      "loss": 1.7568,
+      "step": 12000
+    },
+    {
+      "epoch": 8.75,
+      "grad_norm": 0.24635004997253418,
+      "learning_rate": 4.851485148514852e-05,
+      "loss": 1.7584,
+      "step": 12010
+    },
+    {
+      "epoch": 8.76,
+      "grad_norm": 0.2534136474132538,
+      "learning_rate": 4.7524752475247525e-05,
+      "loss": 1.7602,
+      "step": 12020
+    },
+    {
+      "epoch": 8.77,
+      "grad_norm": 0.26007363200187683,
+      "learning_rate": 4.653465346534654e-05,
+      "loss": 1.7567,
+      "step": 12030
+    },
+    {
+      "epoch": 8.77,
+      "grad_norm": 0.2807808816432953,
+      "learning_rate": 4.554455445544554e-05,
+      "loss": 1.7566,
+      "step": 12040
+    },
+    {
+      "epoch": 8.78,
+      "grad_norm": 0.2677513360977173,
+      "learning_rate": 4.455445544554455e-05,
+      "loss": 1.7567,
+      "step": 12050
+    },
+    {
+      "epoch": 8.79,
+      "grad_norm": 0.2691977620124817,
+      "learning_rate": 4.3564356435643565e-05,
+      "loss": 1.757,
+      "step": 12060
+    },
+    {
+      "epoch": 8.79,
+      "eval_accuracy": 0.6521601327172856,
+      "eval_loss": 1.60829758644104,
+      "eval_runtime": 1089.928,
+      "eval_samples_per_second": 458.177,
+      "eval_steps_per_second": 2.046,
+      "step": 12060
+    },
+    {
+      "epoch": 8.79,
+      "grad_norm": 0.2577356696128845,
+      "learning_rate": 4.257425742574258e-05,
+      "loss": 1.7584,
+      "step": 12070
+    },
+    {
+      "epoch": 8.8,
+      "grad_norm": 0.2654874324798584,
+      "learning_rate": 4.158415841584159e-05,
+      "loss": 1.7571,
+      "step": 12080
+    },
+    {
+      "epoch": 8.81,
+      "grad_norm": 0.25344353914260864,
+      "learning_rate": 4.0594059405940594e-05,
+      "loss": 1.7581,
+      "step": 12090
+    },
+    {
+      "epoch": 8.82,
+      "grad_norm": 0.25865158438682556,
+      "learning_rate": 3.9603960396039605e-05,
+      "loss": 1.7552,
+      "step": 12100
+    },
+    {
+      "epoch": 8.82,
+      "grad_norm": 0.28875982761383057,
+      "learning_rate": 3.861386138613862e-05,
+      "loss": 1.757,
+      "step": 12110
+    },
+    {
+      "epoch": 8.83,
+      "grad_norm": 0.2697414755821228,
+      "learning_rate": 3.762376237623762e-05,
+      "loss": 1.7579,
+      "step": 12120
+    },
+    {
+      "epoch": 8.84,
+      "grad_norm": 0.2786589562892914,
+      "learning_rate": 3.6633663366336634e-05,
+      "loss": 1.7583,
+      "step": 12130
+    },
+    {
+      "epoch": 8.85,
+      "grad_norm": 0.258486270904541,
+      "learning_rate": 3.564356435643564e-05,
+      "loss": 1.7581,
+      "step": 12140
+    },
+    {
+      "epoch": 8.85,
+      "grad_norm": 0.2595365345478058,
+      "learning_rate": 3.465346534653466e-05,
+      "loss": 1.757,
+      "step": 12150
+    },
+    {
+      "epoch": 8.85,
+      "eval_accuracy": 0.652040482066107,
+      "eval_loss": 1.6086018085479736,
+      "eval_runtime": 1089.5635,
+      "eval_samples_per_second": 458.33,
+      "eval_steps_per_second": 2.047,
+      "step": 12150
+    },
+    {
+      "epoch": 8.86,
+      "grad_norm": 0.25674012303352356,
+      "learning_rate": 3.366336633663367e-05,
+      "loss": 1.7595,
+      "step": 12160
+    },
+    {
+      "epoch": 8.87,
+      "grad_norm": 0.23194921016693115,
+      "learning_rate": 3.2673267326732674e-05,
+      "loss": 1.7574,
+      "step": 12170
+    },
+    {
+      "epoch": 8.87,
+      "grad_norm": 0.2626875936985016,
+      "learning_rate": 3.1683168316831686e-05,
+      "loss": 1.7571,
+      "step": 12180
+    },
+    {
+      "epoch": 8.88,
+      "grad_norm": 0.2361476868391037,
+      "learning_rate": 3.069306930693069e-05,
+      "loss": 1.7573,
+      "step": 12190
+    },
+    {
+      "epoch": 8.89,
+      "grad_norm": 0.2606755793094635,
+      "learning_rate": 2.9702970297029702e-05,
+      "loss": 1.7567,
+      "step": 12200
+    },
+    {
+      "epoch": 8.9,
+      "grad_norm": 0.27499887347221375,
+      "learning_rate": 2.8712871287128714e-05,
+      "loss": 1.7579,
+      "step": 12210
+    },
+    {
+      "epoch": 8.9,
+      "grad_norm": 0.24832656979560852,
+      "learning_rate": 2.7722772277227722e-05,
+      "loss": 1.7566,
+      "step": 12220
+    },
+    {
+      "epoch": 8.91,
+      "grad_norm": 0.24898388981819153,
+      "learning_rate": 2.6732673267326734e-05,
+      "loss": 1.7544,
+      "step": 12230
+    },
+    {
+      "epoch": 8.92,
+      "grad_norm": 0.24266423285007477,
+      "learning_rate": 2.5742574257425742e-05,
+      "loss": 1.7559,
+      "step": 12240
+    },
+    {
+      "epoch": 8.92,
+      "eval_accuracy": 0.6522573824099933,
+      "eval_loss": 1.6079708337783813,
+      "eval_runtime": 1089.9176,
+      "eval_samples_per_second": 458.181,
+      "eval_steps_per_second": 2.046,
+      "step": 12240
+    },
+    {
+      "epoch": 8.93,
+      "grad_norm": 0.2438860386610031,
+      "learning_rate": 2.4752475247524754e-05,
+      "loss": 1.7554,
+      "step": 12250
+    },
+    {
+      "epoch": 8.93,
+      "grad_norm": 0.22911418974399567,
+      "learning_rate": 2.3762376237623762e-05,
+      "loss": 1.7547,
+      "step": 12260
+    },
+    {
+      "epoch": 8.94,
+      "grad_norm": 0.2550877034664154,
+      "learning_rate": 2.277227722772277e-05,
+      "loss": 1.7567,
+      "step": 12270
+    },
+    {
+      "epoch": 8.95,
+      "grad_norm": 0.2409505546092987,
+      "learning_rate": 2.1782178217821783e-05,
+      "loss": 1.7556,
+      "step": 12280
+    },
+    {
+      "epoch": 8.95,
+      "grad_norm": 0.23632997274398804,
+      "learning_rate": 2.0792079207920794e-05,
+      "loss": 1.7573,
+      "step": 12290
+    },
+    {
+      "epoch": 8.96,
+      "grad_norm": 0.22292740643024445,
+      "learning_rate": 1.9801980198019803e-05,
+      "loss": 1.757,
+      "step": 12300
+    },
+    {
+      "epoch": 8.97,
+      "grad_norm": 0.2350420504808426,
+      "learning_rate": 1.881188118811881e-05,
+      "loss": 1.756,
+      "step": 12310
+    },
+    {
+      "epoch": 8.98,
+      "grad_norm": 0.22938278317451477,
+      "learning_rate": 1.782178217821782e-05,
+      "loss": 1.7562,
+      "step": 12320
+    },
+    {
+      "epoch": 8.98,
+      "grad_norm": 0.2246268391609192,
+      "learning_rate": 1.6831683168316834e-05,
+      "loss": 1.7556,
+      "step": 12330
+    },
+    {
+      "epoch": 8.98,
+      "eval_accuracy": 0.652376308176148,
+      "eval_loss": 1.6073620319366455,
+      "eval_runtime": 1088.9818,
+      "eval_samples_per_second": 458.575,
+      "eval_steps_per_second": 2.048,
+      "step": 12330
+    },
+    {
+      "epoch": 8.99,
+      "grad_norm": 0.22820483148097992,
+      "learning_rate": 1.5841584158415843e-05,
+      "loss": 1.7564,
+      "step": 12340
+    },
+    {
+      "epoch": 9.0,
+      "grad_norm": 0.2315167486667633,
+      "learning_rate": 1.4851485148514851e-05,
+      "loss": 1.7558,
+      "step": 12350
+    },
+    {
+      "epoch": 9.01,
+      "grad_norm": 0.21513350307941437,
+      "learning_rate": 1.3861386138613861e-05,
+      "loss": 1.757,
+      "step": 12360
+    },
+    {
+      "epoch": 9.01,
+      "grad_norm": 0.21538245677947998,
+      "learning_rate": 1.2871287128712871e-05,
+      "loss": 1.7527,
+      "step": 12370
+    },
+    {
+      "epoch": 9.02,
+      "grad_norm": 0.22796376049518585,
+      "learning_rate": 1.1881188118811881e-05,
+      "loss": 1.7549,
+      "step": 12380
+    },
+    {
+      "epoch": 9.03,
+      "grad_norm": 0.21846508979797363,
+      "learning_rate": 1.0891089108910891e-05,
+      "loss": 1.7527,
+      "step": 12390
+    },
+    {
+      "epoch": 9.03,
+      "grad_norm": 0.2252340316772461,
+      "learning_rate": 9.900990099009901e-06,
+      "loss": 1.757,
+      "step": 12400
+    },
+    {
+      "epoch": 9.04,
+      "grad_norm": 0.22679966688156128,
+      "learning_rate": 8.91089108910891e-06,
+      "loss": 1.7547,
+      "step": 12410
+    },
+    {
+      "epoch": 9.05,
+      "grad_norm": 0.21749068796634674,
+      "learning_rate": 7.920792079207921e-06,
+      "loss": 1.755,
+      "step": 12420
+    },
+    {
+      "epoch": 9.05,
+      "eval_accuracy": 0.6525192559694988,
+      "eval_loss": 1.6068978309631348,
+      "eval_runtime": 1087.147,
+      "eval_samples_per_second": 459.349,
+      "eval_steps_per_second": 2.051,
+      "step": 12420
     }
   ],
   "logging_steps": 10,
@@ -9590,7 +9950,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 90,
-  "total_flos": 1.
+  "total_flos": 1.28938481325833e+18,
   "train_batch_size": 192,
   "trial_name": null,
   "trial_params": null
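The trainer_state.json hunks above extend log_history with a training record every 10 steps and an eval record every 90 steps up to step 12420, and update best_metric and best_model_checkpoint accordingly. A minimal sketch for reading those records back out, assuming a local copy of the updated file (the path below is hypothetical):

# Minimal sketch, assuming a local copy of the updated trainer_state.json
# (the path below is hypothetical): read back the records added above.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

print("best_metric:", state["best_metric"])                # 1.6068978309631348 in this commit
print("best_model_checkpoint:", state["best_model_checkpoint"])

# In this file, eval records carry "eval_loss" and training records carry "loss".
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]
train_logs = [e for e in state["log_history"] if "loss" in e]

last_eval = eval_logs[-1]
print(f"last eval @ step {last_eval['step']}: "
      f"eval_loss={last_eval['eval_loss']:.4f}, eval_accuracy={last_eval['eval_accuracy']:.4f}")
print("last five training losses:", [e["loss"] for e in train_logs[-5:]])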