Training in progress, step 2900, checkpoint
Browse files
last-checkpoint/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1370666272
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:85e10776ed7d2feec702f85a92294fc572495be458fad36bc37e21242039a14d
|
3 |
size 1370666272
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 697294462
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f328674c88f0186255ee5dbf4ac7f148eb4bef19de18a361c1ed0eb9ce9660bb
|
3 |
size 697294462
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:605d9f0439096f21199e65a6f7490d22d8285df735f81d56920505482985be35
|
3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch": 0.
|
5 |
"eval_steps": 500,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
@@ -19957,6 +19957,356 @@
|
|
19957 |
"learning_rate": 0.00019150812636190874,
|
19958 |
"loss": 0.8451,
|
19959 |
"step": 2850
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19960 |
}
|
19961 |
],
|
19962 |
"logging_steps": 1,
|
@@ -19976,7 +20326,7 @@
|
|
19976 |
"attributes": {}
|
19977 |
}
|
19978 |
},
|
19979 |
-
"total_flos": 1.
|
19980 |
"train_batch_size": 32,
|
19981 |
"trial_name": null,
|
19982 |
"trial_params": null
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 0.6730880816989672,
|
5 |
"eval_steps": 500,
|
6 |
+
"global_step": 2900,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
|
|
19957 |
"learning_rate": 0.00019150812636190874,
|
19958 |
"loss": 0.8451,
|
19959 |
"step": 2850
|
19960 |
+
},
|
19961 |
+
{
|
19962 |
+
"epoch": 0.6617152141116398,
|
19963 |
+
"grad_norm": 0.45293116569519043,
|
19964 |
+
"learning_rate": 0.00019150224373375174,
|
19965 |
+
"loss": 0.9152,
|
19966 |
+
"step": 2851
|
19967 |
+
},
|
19968 |
+
{
|
19969 |
+
"epoch": 0.6619473134501567,
|
19970 |
+
"grad_norm": 0.49578094482421875,
|
19971 |
+
"learning_rate": 0.00019149635915915889,
|
19972 |
+
"loss": 0.8429,
|
19973 |
+
"step": 2852
|
19974 |
+
},
|
19975 |
+
{
|
19976 |
+
"epoch": 0.6621794127886735,
|
19977 |
+
"grad_norm": 0.45070314407348633,
|
19978 |
+
"learning_rate": 0.00019149047263825538,
|
19979 |
+
"loss": 0.829,
|
19980 |
+
"step": 2853
|
19981 |
+
},
|
19982 |
+
{
|
19983 |
+
"epoch": 0.6624115121271904,
|
19984 |
+
"grad_norm": 0.44752323627471924,
|
19985 |
+
"learning_rate": 0.00019148458417116645,
|
19986 |
+
"loss": 0.874,
|
19987 |
+
"step": 2854
|
19988 |
+
},
|
19989 |
+
{
|
19990 |
+
"epoch": 0.6626436114657073,
|
19991 |
+
"grad_norm": 0.4903758466243744,
|
19992 |
+
"learning_rate": 0.00019147869375801734,
|
19993 |
+
"loss": 0.8787,
|
19994 |
+
"step": 2855
|
19995 |
+
},
|
19996 |
+
{
|
19997 |
+
"epoch": 0.6628757108042242,
|
19998 |
+
"grad_norm": 0.43119940161705017,
|
19999 |
+
"learning_rate": 0.00019147280139893337,
|
20000 |
+
"loss": 0.8978,
|
20001 |
+
"step": 2856
|
20002 |
+
},
|
20003 |
+
{
|
20004 |
+
"epoch": 0.6631078101427411,
|
20005 |
+
"grad_norm": 0.5306719541549683,
|
20006 |
+
"learning_rate": 0.00019146690709403988,
|
20007 |
+
"loss": 0.9067,
|
20008 |
+
"step": 2857
|
20009 |
+
},
|
20010 |
+
{
|
20011 |
+
"epoch": 0.663339909481258,
|
20012 |
+
"grad_norm": 0.45615947246551514,
|
20013 |
+
"learning_rate": 0.0001914610108434622,
|
20014 |
+
"loss": 0.839,
|
20015 |
+
"step": 2858
|
20016 |
+
},
|
20017 |
+
{
|
20018 |
+
"epoch": 0.6635720088197749,
|
20019 |
+
"grad_norm": 0.4449672996997833,
|
20020 |
+
"learning_rate": 0.00019145511264732584,
|
20021 |
+
"loss": 0.8675,
|
20022 |
+
"step": 2859
|
20023 |
+
},
|
20024 |
+
{
|
20025 |
+
"epoch": 0.6638041081582917,
|
20026 |
+
"grad_norm": 0.4791627526283264,
|
20027 |
+
"learning_rate": 0.00019144921250575619,
|
20028 |
+
"loss": 0.8853,
|
20029 |
+
"step": 2860
|
20030 |
+
},
|
20031 |
+
{
|
20032 |
+
"epoch": 0.6640362074968086,
|
20033 |
+
"grad_norm": 0.4788571298122406,
|
20034 |
+
"learning_rate": 0.00019144331041887882,
|
20035 |
+
"loss": 0.9273,
|
20036 |
+
"step": 2861
|
20037 |
+
},
|
20038 |
+
{
|
20039 |
+
"epoch": 0.6642683068353256,
|
20040 |
+
"grad_norm": 0.42612382769584656,
|
20041 |
+
"learning_rate": 0.00019143740638681922,
|
20042 |
+
"loss": 0.893,
|
20043 |
+
"step": 2862
|
20044 |
+
},
|
20045 |
+
{
|
20046 |
+
"epoch": 0.6645004061738424,
|
20047 |
+
"grad_norm": 0.47250261902809143,
|
20048 |
+
"learning_rate": 0.000191431500409703,
|
20049 |
+
"loss": 0.8795,
|
20050 |
+
"step": 2863
|
20051 |
+
},
|
20052 |
+
{
|
20053 |
+
"epoch": 0.6647325055123593,
|
20054 |
+
"grad_norm": 0.4971529543399811,
|
20055 |
+
"learning_rate": 0.00019142559248765587,
|
20056 |
+
"loss": 0.9454,
|
20057 |
+
"step": 2864
|
20058 |
+
},
|
20059 |
+
{
|
20060 |
+
"epoch": 0.6649646048508762,
|
20061 |
+
"grad_norm": 0.5304151177406311,
|
20062 |
+
"learning_rate": 0.00019141968262080335,
|
20063 |
+
"loss": 0.9761,
|
20064 |
+
"step": 2865
|
20065 |
+
},
|
20066 |
+
{
|
20067 |
+
"epoch": 0.665196704189393,
|
20068 |
+
"grad_norm": 0.4551432728767395,
|
20069 |
+
"learning_rate": 0.00019141377080927132,
|
20070 |
+
"loss": 0.8263,
|
20071 |
+
"step": 2866
|
20072 |
+
},
|
20073 |
+
{
|
20074 |
+
"epoch": 0.6654288035279099,
|
20075 |
+
"grad_norm": 0.4839153289794922,
|
20076 |
+
"learning_rate": 0.0001914078570531854,
|
20077 |
+
"loss": 0.8516,
|
20078 |
+
"step": 2867
|
20079 |
+
},
|
20080 |
+
{
|
20081 |
+
"epoch": 0.6656609028664269,
|
20082 |
+
"grad_norm": 0.501598060131073,
|
20083 |
+
"learning_rate": 0.0001914019413526715,
|
20084 |
+
"loss": 0.8753,
|
20085 |
+
"step": 2868
|
20086 |
+
},
|
20087 |
+
{
|
20088 |
+
"epoch": 0.6658930022049437,
|
20089 |
+
"grad_norm": 0.49526655673980713,
|
20090 |
+
"learning_rate": 0.00019139602370785538,
|
20091 |
+
"loss": 0.8342,
|
20092 |
+
"step": 2869
|
20093 |
+
},
|
20094 |
+
{
|
20095 |
+
"epoch": 0.6661251015434606,
|
20096 |
+
"grad_norm": 0.49142616987228394,
|
20097 |
+
"learning_rate": 0.00019139010411886291,
|
20098 |
+
"loss": 0.8389,
|
20099 |
+
"step": 2870
|
20100 |
+
},
|
20101 |
+
{
|
20102 |
+
"epoch": 0.6663572008819775,
|
20103 |
+
"grad_norm": 0.5267114639282227,
|
20104 |
+
"learning_rate": 0.00019138418258582006,
|
20105 |
+
"loss": 0.8339,
|
20106 |
+
"step": 2871
|
20107 |
+
},
|
20108 |
+
{
|
20109 |
+
"epoch": 0.6665893002204943,
|
20110 |
+
"grad_norm": 0.42393583059310913,
|
20111 |
+
"learning_rate": 0.0001913782591088528,
|
20112 |
+
"loss": 0.8576,
|
20113 |
+
"step": 2872
|
20114 |
+
},
|
20115 |
+
{
|
20116 |
+
"epoch": 0.6668213995590112,
|
20117 |
+
"grad_norm": 0.4962637722492218,
|
20118 |
+
"learning_rate": 0.0001913723336880871,
|
20119 |
+
"loss": 0.837,
|
20120 |
+
"step": 2873
|
20121 |
+
},
|
20122 |
+
{
|
20123 |
+
"epoch": 0.6670534988975282,
|
20124 |
+
"grad_norm": 0.4471946656703949,
|
20125 |
+
"learning_rate": 0.000191366406323649,
|
20126 |
+
"loss": 0.8259,
|
20127 |
+
"step": 2874
|
20128 |
+
},
|
20129 |
+
{
|
20130 |
+
"epoch": 0.667285598236045,
|
20131 |
+
"grad_norm": 0.48034703731536865,
|
20132 |
+
"learning_rate": 0.00019136047701566464,
|
20133 |
+
"loss": 0.8537,
|
20134 |
+
"step": 2875
|
20135 |
+
},
|
20136 |
+
{
|
20137 |
+
"epoch": 0.6675176975745619,
|
20138 |
+
"grad_norm": 0.47116121649742126,
|
20139 |
+
"learning_rate": 0.0001913545457642601,
|
20140 |
+
"loss": 0.8252,
|
20141 |
+
"step": 2876
|
20142 |
+
},
|
20143 |
+
{
|
20144 |
+
"epoch": 0.6677497969130788,
|
20145 |
+
"grad_norm": 0.5071761012077332,
|
20146 |
+
"learning_rate": 0.00019134861256956155,
|
20147 |
+
"loss": 0.898,
|
20148 |
+
"step": 2877
|
20149 |
+
},
|
20150 |
+
{
|
20151 |
+
"epoch": 0.6679818962515957,
|
20152 |
+
"grad_norm": 0.4993492662906647,
|
20153 |
+
"learning_rate": 0.00019134267743169524,
|
20154 |
+
"loss": 0.8555,
|
20155 |
+
"step": 2878
|
20156 |
+
},
|
20157 |
+
{
|
20158 |
+
"epoch": 0.6682139955901125,
|
20159 |
+
"grad_norm": 0.5150817036628723,
|
20160 |
+
"learning_rate": 0.00019133674035078736,
|
20161 |
+
"loss": 0.8624,
|
20162 |
+
"step": 2879
|
20163 |
+
},
|
20164 |
+
{
|
20165 |
+
"epoch": 0.6684460949286295,
|
20166 |
+
"grad_norm": 0.5153425931930542,
|
20167 |
+
"learning_rate": 0.00019133080132696426,
|
20168 |
+
"loss": 0.8093,
|
20169 |
+
"step": 2880
|
20170 |
+
},
|
20171 |
+
{
|
20172 |
+
"epoch": 0.6686781942671464,
|
20173 |
+
"grad_norm": 0.4248557686805725,
|
20174 |
+
"learning_rate": 0.00019132486036035226,
|
20175 |
+
"loss": 0.8488,
|
20176 |
+
"step": 2881
|
20177 |
+
},
|
20178 |
+
{
|
20179 |
+
"epoch": 0.6689102936056632,
|
20180 |
+
"grad_norm": 0.4647797644138336,
|
20181 |
+
"learning_rate": 0.0001913189174510777,
|
20182 |
+
"loss": 0.9239,
|
20183 |
+
"step": 2882
|
20184 |
+
},
|
20185 |
+
{
|
20186 |
+
"epoch": 0.6691423929441801,
|
20187 |
+
"grad_norm": 0.5158550143241882,
|
20188 |
+
"learning_rate": 0.00019131297259926706,
|
20189 |
+
"loss": 0.8746,
|
20190 |
+
"step": 2883
|
20191 |
+
},
|
20192 |
+
{
|
20193 |
+
"epoch": 0.669374492282697,
|
20194 |
+
"grad_norm": 0.4511086344718933,
|
20195 |
+
"learning_rate": 0.00019130702580504676,
|
20196 |
+
"loss": 0.897,
|
20197 |
+
"step": 2884
|
20198 |
+
},
|
20199 |
+
{
|
20200 |
+
"epoch": 0.6696065916212138,
|
20201 |
+
"grad_norm": 0.5059782862663269,
|
20202 |
+
"learning_rate": 0.0001913010770685433,
|
20203 |
+
"loss": 0.8666,
|
20204 |
+
"step": 2885
|
20205 |
+
},
|
20206 |
+
{
|
20207 |
+
"epoch": 0.6698386909597308,
|
20208 |
+
"grad_norm": 0.4928185045719147,
|
20209 |
+
"learning_rate": 0.00019129512638988322,
|
20210 |
+
"loss": 0.842,
|
20211 |
+
"step": 2886
|
20212 |
+
},
|
20213 |
+
{
|
20214 |
+
"epoch": 0.6700707902982477,
|
20215 |
+
"grad_norm": 0.5002438426017761,
|
20216 |
+
"learning_rate": 0.00019128917376919313,
|
20217 |
+
"loss": 0.9076,
|
20218 |
+
"step": 2887
|
20219 |
+
},
|
20220 |
+
{
|
20221 |
+
"epoch": 0.6703028896367645,
|
20222 |
+
"grad_norm": 0.427513986825943,
|
20223 |
+
"learning_rate": 0.0001912832192065996,
|
20224 |
+
"loss": 0.8238,
|
20225 |
+
"step": 2888
|
20226 |
+
},
|
20227 |
+
{
|
20228 |
+
"epoch": 0.6705349889752814,
|
20229 |
+
"grad_norm": 0.45401087403297424,
|
20230 |
+
"learning_rate": 0.0001912772627022294,
|
20231 |
+
"loss": 0.8605,
|
20232 |
+
"step": 2889
|
20233 |
+
},
|
20234 |
+
{
|
20235 |
+
"epoch": 0.6707670883137983,
|
20236 |
+
"grad_norm": 0.43657442927360535,
|
20237 |
+
"learning_rate": 0.0001912713042562091,
|
20238 |
+
"loss": 0.8506,
|
20239 |
+
"step": 2890
|
20240 |
+
},
|
20241 |
+
{
|
20242 |
+
"epoch": 0.6709991876523151,
|
20243 |
+
"grad_norm": 0.41969212889671326,
|
20244 |
+
"learning_rate": 0.00019126534386866556,
|
20245 |
+
"loss": 0.8791,
|
20246 |
+
"step": 2891
|
20247 |
+
},
|
20248 |
+
{
|
20249 |
+
"epoch": 0.6712312869908321,
|
20250 |
+
"grad_norm": 0.46783447265625,
|
20251 |
+
"learning_rate": 0.00019125938153972548,
|
20252 |
+
"loss": 0.8774,
|
20253 |
+
"step": 2892
|
20254 |
+
},
|
20255 |
+
{
|
20256 |
+
"epoch": 0.671463386329349,
|
20257 |
+
"grad_norm": 0.44763606786727905,
|
20258 |
+
"learning_rate": 0.00019125341726951577,
|
20259 |
+
"loss": 0.9214,
|
20260 |
+
"step": 2893
|
20261 |
+
},
|
20262 |
+
{
|
20263 |
+
"epoch": 0.6716954856678659,
|
20264 |
+
"grad_norm": 0.46709761023521423,
|
20265 |
+
"learning_rate": 0.00019124745105816325,
|
20266 |
+
"loss": 0.8276,
|
20267 |
+
"step": 2894
|
20268 |
+
},
|
20269 |
+
{
|
20270 |
+
"epoch": 0.6719275850063827,
|
20271 |
+
"grad_norm": 0.471754252910614,
|
20272 |
+
"learning_rate": 0.0001912414829057949,
|
20273 |
+
"loss": 0.8645,
|
20274 |
+
"step": 2895
|
20275 |
+
},
|
20276 |
+
{
|
20277 |
+
"epoch": 0.6721596843448996,
|
20278 |
+
"grad_norm": 0.4268680810928345,
|
20279 |
+
"learning_rate": 0.00019123551281253757,
|
20280 |
+
"loss": 0.8376,
|
20281 |
+
"step": 2896
|
20282 |
+
},
|
20283 |
+
{
|
20284 |
+
"epoch": 0.6723917836834165,
|
20285 |
+
"grad_norm": 0.4184396266937256,
|
20286 |
+
"learning_rate": 0.00019122954077851833,
|
20287 |
+
"loss": 0.87,
|
20288 |
+
"step": 2897
|
20289 |
+
},
|
20290 |
+
{
|
20291 |
+
"epoch": 0.6726238830219334,
|
20292 |
+
"grad_norm": 0.48813703656196594,
|
20293 |
+
"learning_rate": 0.0001912235668038642,
|
20294 |
+
"loss": 0.866,
|
20295 |
+
"step": 2898
|
20296 |
+
},
|
20297 |
+
{
|
20298 |
+
"epoch": 0.6728559823604503,
|
20299 |
+
"grad_norm": 0.4599473774433136,
|
20300 |
+
"learning_rate": 0.00019121759088870226,
|
20301 |
+
"loss": 0.834,
|
20302 |
+
"step": 2899
|
20303 |
+
},
|
20304 |
+
{
|
20305 |
+
"epoch": 0.6730880816989672,
|
20306 |
+
"grad_norm": 0.4024162292480469,
|
20307 |
+
"learning_rate": 0.00019121161303315963,
|
20308 |
+
"loss": 0.8731,
|
20309 |
+
"step": 2900
|
20310 |
}
|
20311 |
],
|
20312 |
"logging_steps": 1,
|
|
|
20326 |
"attributes": {}
|
20327 |
}
|
20328 |
},
|
20329 |
+
"total_flos": 1.2872424773124096e+18,
|
20330 |
"train_batch_size": 32,
|
20331 |
"trial_name": null,
|
20332 |
"trial_params": null
|