{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9700214132762313,
  "eval_steps": 500,
  "global_step": 310,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 0.15481117367744446,
      "learning_rate": 2e-05,
      "loss": 1.1213,
      "step": 1
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.16627414524555206,
      "learning_rate": 4e-05,
      "loss": 1.1341,
      "step": 2
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.14720524847507477,
      "learning_rate": 6e-05,
      "loss": 1.148,
      "step": 3
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.15325099229812622,
      "learning_rate": 8e-05,
      "loss": 1.1435,
      "step": 4
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.16704852879047394,
      "learning_rate": 0.0001,
      "loss": 1.0895,
      "step": 5
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.11686641722917557,
      "learning_rate": 0.00012,
      "loss": 1.0784,
      "step": 6
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.09641632437705994,
      "learning_rate": 0.00014,
      "loss": 1.0612,
      "step": 7
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.12384118884801865,
      "learning_rate": 0.00016,
      "loss": 1.0566,
      "step": 8
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.07287071645259857,
      "learning_rate": 0.00018,
      "loss": 1.0442,
      "step": 9
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.07469318807125092,
      "learning_rate": 0.0002,
      "loss": 1.0083,
      "step": 10
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.08364757895469666,
      "learning_rate": 0.00019999761633493753,
      "loss": 1.0169,
      "step": 11
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.07763934135437012,
      "learning_rate": 0.0001999904654533872,
      "loss": 1.0348,
      "step": 12
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.06744416803121567,
      "learning_rate": 0.0001999785476962552,
      "loss": 1.0123,
      "step": 13
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.07724820077419281,
      "learning_rate": 0.00019996186363170035,
      "loss": 1.0188,
      "step": 14
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.0746370479464531,
      "learning_rate": 0.00019994041405510705,
      "loss": 1.0164,
      "step": 15
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.055804282426834106,
      "learning_rate": 0.00019991419998904747,
      "loss": 1.0587,
      "step": 16
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.054163169115781784,
      "learning_rate": 0.00019988322268323268,
      "loss": 1.0149,
      "step": 17
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.05897677689790726,
      "learning_rate": 0.00019984748361445308,
      "loss": 1.0136,
      "step": 18
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.05603804066777229,
      "learning_rate": 0.00019980698448650804,
      "loss": 0.9996,
      "step": 19
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.06250110268592834,
      "learning_rate": 0.0001997617272301248,
      "loss": 1.0145,
      "step": 20
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.05678323656320572,
      "learning_rate": 0.000199711714002866,
      "loss": 1.0005,
      "step": 21
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.05278163403272629,
      "learning_rate": 0.00019965694718902745,
      "loss": 1.0034,
      "step": 22
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.05601625144481659,
      "learning_rate": 0.00019959742939952392,
      "loss": 0.9915,
      "step": 23
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.054547134786844254,
      "learning_rate": 0.00019953316347176488,
      "loss": 1.0115,
      "step": 24
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.06417939066886902,
      "learning_rate": 0.0001994641524695193,
      "loss": 0.9862,
      "step": 25
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.061326365917921066,
      "learning_rate": 0.0001993903996827694,
      "loss": 0.9889,
      "step": 26
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.05376205965876579,
      "learning_rate": 0.00019931190862755417,
      "loss": 0.9604,
      "step": 27
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.0678999274969101,
      "learning_rate": 0.00019922868304580118,
      "loss": 1.0492,
      "step": 28
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.053755760192871094,
      "learning_rate": 0.0001991407269051487,
      "loss": 0.9985,
      "step": 29
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.05401955544948578,
      "learning_rate": 0.00019904804439875633,
      "loss": 0.9787,
      "step": 30
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.05666874349117279,
      "learning_rate": 0.0001989506399451051,
      "loss": 0.9886,
      "step": 31
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.05425805598497391,
      "learning_rate": 0.00019884851818778693,
      "loss": 1.0197,
      "step": 32
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.06086369976401329,
      "learning_rate": 0.00019874168399528305,
      "loss": 0.9879,
      "step": 33
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.05577366426587105,
      "learning_rate": 0.00019863014246073214,
      "loss": 0.9808,
      "step": 34
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.05853752791881561,
      "learning_rate": 0.0001985138989016874,
      "loss": 0.957,
      "step": 35
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.0582735501229763,
      "learning_rate": 0.00019839295885986296,
      "loss": 0.9732,
      "step": 36
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.07255622744560242,
      "learning_rate": 0.00019826732810086998,
      "loss": 1.0199,
      "step": 37
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.06085599586367607,
      "learning_rate": 0.00019813701261394136,
      "loss": 0.9946,
      "step": 38
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.06600817292928696,
      "learning_rate": 0.00019800201861164664,
      "loss": 0.9646,
      "step": 39
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.0634196326136589,
      "learning_rate": 0.00019786235252959553,
      "loss": 1.0092,
      "step": 40
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.05254920944571495,
      "learning_rate": 0.00019771802102613127,
      "loss": 0.9535,
      "step": 41
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.06008182466030121,
      "learning_rate": 0.00019756903098201308,
      "loss": 0.9897,
      "step": 42
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.07715290039777756,
      "learning_rate": 0.00019741538950008818,
      "loss": 1.0132,
      "step": 43
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.06104297190904617,
      "learning_rate": 0.0001972571039049533,
      "loss": 0.9938,
      "step": 44
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.06008582562208176,
      "learning_rate": 0.0001970941817426052,
      "loss": 0.9889,
      "step": 45
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.05699775367975235,
      "learning_rate": 0.00019692663078008132,
      "loss": 0.9843,
      "step": 46
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.05760645866394043,
      "learning_rate": 0.00019675445900508909,
      "loss": 0.9677,
      "step": 47
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.06075143814086914,
      "learning_rate": 0.00019657767462562544,
      "loss": 0.9929,
      "step": 48
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.059820640832185745,
      "learning_rate": 0.00019639628606958533,
      "loss": 0.9889,
      "step": 49
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.059315863996744156,
      "learning_rate": 0.00019621030198436006,
      "loss": 0.9994,
      "step": 50
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.05949851870536804,
      "learning_rate": 0.00019601973123642492,
      "loss": 0.9593,
      "step": 51
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.06036762520670891,
      "learning_rate": 0.00019582458291091663,
      "loss": 0.9669,
      "step": 52
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.05799931660294533,
      "learning_rate": 0.00019562486631120006,
      "loss": 0.9731,
      "step": 53
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.05733400583267212,
      "learning_rate": 0.00019542059095842485,
      "loss": 0.9676,
      "step": 54
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.06894834339618683,
      "learning_rate": 0.00019521176659107142,
      "loss": 1.0142,
      "step": 55
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.0657154992222786,
      "learning_rate": 0.00019499840316448673,
      "loss": 0.9598,
      "step": 56
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.0570225715637207,
      "learning_rate": 0.00019478051085040975,
      "loss": 0.9979,
      "step": 57
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.06271913647651672,
      "learning_rate": 0.00019455810003648637,
      "loss": 0.9694,
      "step": 58
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.05831892415881157,
      "learning_rate": 0.0001943311813257743,
      "loss": 0.9934,
      "step": 59
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.061948519200086594,
      "learning_rate": 0.00019409976553623766,
      "loss": 0.9812,
      "step": 60
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.06726415455341339,
      "learning_rate": 0.00019386386370023103,
      "loss": 0.9837,
      "step": 61
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.057408351451158524,
      "learning_rate": 0.00019362348706397373,
      "loss": 0.9512,
      "step": 62
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.057638928294181824,
      "learning_rate": 0.00019337864708701357,
      "loss": 0.9622,
      "step": 63
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.06090644374489784,
      "learning_rate": 0.00019312935544168048,
      "loss": 0.9927,
      "step": 64
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.05811937153339386,
      "learning_rate": 0.00019287562401253022,
      "loss": 0.9905,
      "step": 65
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.05927939713001251,
      "learning_rate": 0.00019261746489577765,
      "loss": 0.9604,
      "step": 66
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.06021604314446449,
      "learning_rate": 0.0001923548903987201,
      "loss": 0.9535,
      "step": 67
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.07004135102033615,
      "learning_rate": 0.00019208791303915063,
      "loss": 1.0032,
      "step": 68
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.05626143515110016,
      "learning_rate": 0.0001918165455447614,
      "loss": 0.9726,
      "step": 69
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.06130916625261307,
      "learning_rate": 0.00019154080085253666,
      "loss": 0.9646,
      "step": 70
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.06467108428478241,
      "learning_rate": 0.0001912606921081362,
      "loss": 0.9516,
      "step": 71
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.06047213450074196,
      "learning_rate": 0.0001909762326652686,
      "loss": 0.9664,
      "step": 72
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.05907664820551872,
      "learning_rate": 0.00019068743608505455,
      "loss": 0.9796,
      "step": 73
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.06679921597242355,
      "learning_rate": 0.00019039431613538047,
      "loss": 0.9678,
      "step": 74
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.06133173033595085,
      "learning_rate": 0.0001900968867902419,
      "loss": 0.9875,
      "step": 75
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.05841493234038353,
      "learning_rate": 0.00018979516222907775,
      "loss": 0.9686,
      "step": 76
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.0660431906580925,
      "learning_rate": 0.00018948915683609388,
      "loss": 0.9863,
      "step": 77
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.05776617303490639,
      "learning_rate": 0.00018917888519957754,
      "loss": 0.9417,
      "step": 78
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.05937017872929573,
      "learning_rate": 0.00018886436211120193,
      "loss": 0.9995,
      "step": 79
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.06314114481210709,
      "learning_rate": 0.000188545602565321,
      "loss": 0.9806,
      "step": 80
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.060519032180309296,
      "learning_rate": 0.00018822262175825462,
      "loss": 0.9741,
      "step": 81
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.06154269725084305,
      "learning_rate": 0.00018789543508756408,
      "loss": 0.9793,
      "step": 82
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.06176121160387993,
      "learning_rate": 0.00018756405815131813,
      "loss": 0.9453,
      "step": 83
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.06044905260205269,
      "learning_rate": 0.00018722850674734927,
      "loss": 0.9462,
      "step": 84
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.05896229296922684,
      "learning_rate": 0.00018688879687250067,
      "loss": 0.9963,
      "step": 85
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.06071419641375542,
      "learning_rate": 0.0001865449447218635,
      "loss": 0.9914,
      "step": 86
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.0697932317852974,
      "learning_rate": 0.00018619696668800492,
      "loss": 0.9726,
      "step": 87
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.062443289905786514,
      "learning_rate": 0.00018584487936018661,
      "loss": 1.0084,
      "step": 88
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.059460923075675964,
      "learning_rate": 0.0001854886995235738,
      "loss": 0.9404,
      "step": 89
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.058260347694158554,
      "learning_rate": 0.00018512844415843514,
      "loss": 0.9796,
      "step": 90
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.05946533381938934,
      "learning_rate": 0.00018476413043933313,
      "loss": 0.9418,
      "step": 91
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.06572849303483963,
      "learning_rate": 0.00018439577573430555,
      "loss": 0.9785,
      "step": 92
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.06783867627382278,
      "learning_rate": 0.00018402339760403713,
      "loss": 0.9747,
      "step": 93
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.06454402953386307,
      "learning_rate": 0.00018364701380102266,
      "loss": 0.9779,
      "step": 94
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.06309663504362106,
      "learning_rate": 0.00018326664226872065,
      "loss": 0.9643,
      "step": 95
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.05967305600643158,
      "learning_rate": 0.00018288230114069765,
      "loss": 0.9752,
      "step": 96
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.05811592936515808,
      "learning_rate": 0.0001824940087397641,
      "loss": 0.9551,
      "step": 97
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.0642295628786087,
      "learning_rate": 0.00018210178357710058,
      "loss": 0.9522,
      "step": 98
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.05724099278450012,
      "learning_rate": 0.0001817056443513754,
      "loss": 1.0051,
      "step": 99
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.056155964732170105,
      "learning_rate": 0.00018130560994785325,
      "loss": 0.9778,
      "step": 100
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.058100346475839615,
      "learning_rate": 0.00018090169943749476,
      "loss": 0.9825,
      "step": 101
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.06120794638991356,
      "learning_rate": 0.00018049393207604733,
      "loss": 0.9839,
      "step": 102
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.056975312530994415,
      "learning_rate": 0.00018008232730312723,
      "loss": 0.9968,
      "step": 103
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.06239038705825806,
      "learning_rate": 0.00017966690474129285,
      "loss": 0.9906,
      "step": 104
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.5958348512649536,
      "learning_rate": 0.00017924768419510904,
      "loss": 2.6531,
      "step": 105
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.06554935872554779,
      "learning_rate": 0.00017882468565020326,
      "loss": 1.0164,
      "step": 106
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.05698655918240547,
      "learning_rate": 0.00017839792927231254,
      "loss": 0.9516,
      "step": 107
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.06186239421367645,
      "learning_rate": 0.00017796743540632223,
      "loss": 0.9933,
      "step": 108
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.05811876431107521,
      "learning_rate": 0.00017753322457529614,
      "loss": 0.9552,
      "step": 109
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.06247268617153168,
      "learning_rate": 0.00017709531747949796,
      "loss": 0.9316,
      "step": 110
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.06278502196073532,
      "learning_rate": 0.00017665373499540463,
      "loss": 0.9867,
      "step": 111
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.06079186499118805,
      "learning_rate": 0.00017620849817471092,
      "loss": 1.0233,
      "step": 112
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.05586745962500572,
      "learning_rate": 0.00017575962824332596,
      "loss": 0.9454,
      "step": 113
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.059647805988788605,
      "learning_rate": 0.00017530714660036112,
      "loss": 0.9718,
      "step": 114
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.060143355280160904,
      "learning_rate": 0.00017485107481711012,
      "loss": 0.9927,
      "step": 115
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.4768543541431427,
      "learning_rate": 0.0001743914346360205,
      "loss": 2.4526,
      "step": 116
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.059104178100824356,
      "learning_rate": 0.00017392824796965702,
      "loss": 0.9366,
      "step": 117
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.06858639419078827,
      "learning_rate": 0.00017346153689965727,
      "loss": 0.9783,
      "step": 118
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.06308155506849289,
      "learning_rate": 0.00017299132367567857,
      "loss": 0.9688,
      "step": 119
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.0601269006729126,
      "learning_rate": 0.00017251763071433765,
      "loss": 0.9937,
      "step": 120
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.06544536352157593,
      "learning_rate": 0.00017204048059814175,
      "loss": 0.9351,
      "step": 121
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.06467759609222412,
      "learning_rate": 0.00017155989607441213,
      "loss": 0.9918,
      "step": 122
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.061619073152542114,
      "learning_rate": 0.0001710759000541995,
      "loss": 0.9872,
      "step": 123
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.06122846156358719,
      "learning_rate": 0.00017058851561119198,
      "loss": 0.968,
      "step": 124
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.08277314156293869,
      "learning_rate": 0.00017009776598061495,
      "loss": 0.9869,
      "step": 125
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.07559008151292801,
      "learning_rate": 0.00016960367455812336,
      "loss": 0.9804,
      "step": 126
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.06251110136508942,
      "learning_rate": 0.00016910626489868649,
      "loss": 0.978,
      "step": 127
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.06253345310688019,
      "learning_rate": 0.0001686055607154648,
      "loss": 0.9524,
      "step": 128
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.05948334559798241,
      "learning_rate": 0.00016810158587867973,
      "loss": 0.9826,
      "step": 129
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.06356865167617798,
      "learning_rate": 0.00016759436441447545,
      "loss": 0.9805,
      "step": 130
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.06536010652780533,
      "learning_rate": 0.00016708392050377363,
      "loss": 1.0146,
      "step": 131
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.06137322261929512,
      "learning_rate": 0.00016657027848112062,
      "loss": 0.9457,
      "step": 132
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.05955340713262558,
      "learning_rate": 0.00016605346283352727,
      "loss": 0.9823,
      "step": 133
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.0602475143969059,
      "learning_rate": 0.00016553349819930165,
      "loss": 1.0077,
      "step": 134
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.05905039981007576,
      "learning_rate": 0.00016501040936687443,
      "loss": 0.9313,
      "step": 135
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.06087419390678406,
      "learning_rate": 0.00016448422127361706,
      "loss": 0.9725,
      "step": 136
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.059714119881391525,
      "learning_rate": 0.00016395495900465304,
      "loss": 0.9963,
      "step": 137
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.06461544334888458,
      "learning_rate": 0.000163422647791662,
      "loss": 0.957,
      "step": 138
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.06247726082801819,
      "learning_rate": 0.00016288731301167668,
      "loss": 0.9742,
      "step": 139
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.06527213007211685,
      "learning_rate": 0.00016234898018587337,
      "loss": 0.9789,
      "step": 140
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.059757690876722336,
      "learning_rate": 0.00016180767497835503,
      "loss": 0.9309,
      "step": 141
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.059084221720695496,
      "learning_rate": 0.00016126342319492784,
      "loss": 0.9546,
      "step": 142
    },
    {
      "epoch": 0.92,
      "grad_norm": 1.778961181640625,
      "learning_rate": 0.00016071625078187114,
      "loss": 2.6066,
      "step": 143
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.06468740105628967,
      "learning_rate": 0.00016016618382470012,
      "loss": 0.9472,
      "step": 144
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.061647091060876846,
      "learning_rate": 0.00015961324854692254,
      "loss": 0.9836,
      "step": 145
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.06061193719506264,
      "learning_rate": 0.0001590574713087885,
      "loss": 0.982,
      "step": 146
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.06635820865631104,
      "learning_rate": 0.00015849887860603374,
      "loss": 0.9873,
      "step": 147
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.06260058283805847,
      "learning_rate": 0.00015793749706861636,
      "loss": 0.9827,
      "step": 148
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.06037148833274841,
      "learning_rate": 0.00015737335345944757,
      "loss": 1.0072,
      "step": 149
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.06277037411928177,
      "learning_rate": 0.00015680647467311557,
      "loss": 0.9498,
      "step": 150
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.06306284666061401,
      "learning_rate": 0.00015623688773460357,
      "loss": 0.9866,
      "step": 151
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.07311715185642242,
      "learning_rate": 0.00015566461979800122,
      "loss": 0.9722,
      "step": 152
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.06143077090382576,
      "learning_rate": 0.00015508969814521025,
      "loss": 0.9442,
      "step": 153
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.067879818379879,
      "learning_rate": 0.00015451215018464387,
      "loss": 0.9416,
      "step": 154
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.06907378137111664,
      "learning_rate": 0.00015393200344991995,
      "loss": 0.9813,
      "step": 155
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.06299201399087906,
      "learning_rate": 0.0001533492855985485,
      "loss": 0.9684,
      "step": 156
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.059243012219667435,
      "learning_rate": 0.0001527640244106133,
      "loss": 0.9883,
      "step": 157
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.06446617841720581,
      "learning_rate": 0.00015217624778744718,
      "loss": 0.9836,
      "step": 158
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.07795852422714233,
      "learning_rate": 0.00015158598375030217,
      "loss": 0.9682,
      "step": 159
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.0846932977437973,
      "learning_rate": 0.0001509932604390136,
      "loss": 0.8964,
      "step": 160
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.06378539651632309,
      "learning_rate": 0.0001503981061106584,
      "loss": 0.8712,
      "step": 161
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.06959015876054764,
      "learning_rate": 0.00014980054913820814,
      "loss": 0.8538,
      "step": 162
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.11089441180229187,
      "learning_rate": 0.00014920061800917638,
      "loss": 0.8898,
      "step": 163
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.10314545780420303,
      "learning_rate": 0.0001485983413242606,
      "loss": 0.858,
      "step": 164
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.06961672008037567,
      "learning_rate": 0.00014799374779597867,
      "loss": 0.8605,
      "step": 165
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.07029856741428375,
      "learning_rate": 0.00014738686624729986,
      "loss": 0.8536,
      "step": 166
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.07742595672607422,
      "learning_rate": 0.0001467777256102712,
      "loss": 0.875,
      "step": 167
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.06792555004358292,
      "learning_rate": 0.00014616635492463776,
      "loss": 0.8829,
      "step": 168
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.08564931154251099,
      "learning_rate": 0.00014555278333645833,
      "loss": 0.9109,
      "step": 169
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.0962703675031662,
      "learning_rate": 0.00014493704009671613,
      "loss": 0.8707,
      "step": 170
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.09053023904561996,
      "learning_rate": 0.00014431915455992414,
      "loss": 0.9005,
      "step": 171
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.07491155713796616,
      "learning_rate": 0.00014369915618272567,
      "loss": 0.8805,
      "step": 172
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.08745575696229935,
      "learning_rate": 0.00014307707452249012,
      "loss": 0.8766,
      "step": 173
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.08251971006393433,
      "learning_rate": 0.0001424529392359039,
      "loss": 0.8939,
      "step": 174
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.07616613060235977,
      "learning_rate": 0.0001418267800775565,
      "loss": 0.8678,
      "step": 175
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.08193603903055191,
      "learning_rate": 0.00014119862689852223,
      "loss": 0.8473,
      "step": 176
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.08167202025651932,
      "learning_rate": 0.0001405685096449367,
      "loss": 0.8669,
      "step": 177
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.08531814068555832,
      "learning_rate": 0.00013993645835656953,
      "loss": 0.8637,
      "step": 178
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.08357888460159302,
      "learning_rate": 0.00013930250316539238,
      "loss": 0.846,
      "step": 179
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.583361804485321,
      "learning_rate": 0.0001386666742941419,
      "loss": 2.3925,
      "step": 180
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.08598628640174866,
      "learning_rate": 0.00013802900205487948,
      "loss": 0.8702,
      "step": 181
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.0913216769695282,
      "learning_rate": 0.00013738951684754585,
      "loss": 0.8492,
      "step": 182
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.08879151940345764,
      "learning_rate": 0.00013674824915851192,
      "loss": 0.8721,
      "step": 183
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.09359906613826752,
      "learning_rate": 0.0001361052295591255,
      "loss": 0.8678,
      "step": 184
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.09475123882293701,
      "learning_rate": 0.00013546048870425356,
      "loss": 0.8514,
      "step": 185
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.1002567857503891,
      "learning_rate": 0.00013481405733082116,
      "loss": 0.8688,
      "step": 186
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.09523095935583115,
      "learning_rate": 0.00013416596625634593,
      "loss": 0.8776,
      "step": 187
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.09465932101011276,
      "learning_rate": 0.00013351624637746886,
      "loss": 0.8965,
      "step": 188
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.09145588427782059,
      "learning_rate": 0.00013286492866848142,
      "loss": 0.8587,
      "step": 189
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.09743087738752365,
      "learning_rate": 0.00013221204417984908,
      "loss": 0.823,
      "step": 190
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.09468277543783188,
      "learning_rate": 0.00013155762403673063,
      "loss": 0.8578,
      "step": 191
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.09516102075576782,
      "learning_rate": 0.00013090169943749476,
      "loss": 0.8944,
      "step": 192
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.09514753520488739,
      "learning_rate": 0.00013024430165223244,
      "loss": 0.8175,
      "step": 193
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.10116668790578842,
      "learning_rate": 0.0001295854620212664,
      "loss": 0.8572,
      "step": 194
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.09801662713289261,
      "learning_rate": 0.00012892521195365678,
      "loss": 0.8532,
      "step": 195
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.09754762053489685,
      "learning_rate": 0.00012826358292570398,
      "loss": 0.8653,
      "step": 196
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.10124494135379791,
      "learning_rate": 0.00012760060647944795,
      "loss": 0.9007,
      "step": 197
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.09749335795640945,
      "learning_rate": 0.00012693631422116454,
      "loss": 0.8304,
      "step": 198
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.09953852742910385,
      "learning_rate": 0.0001262707378198587,
      "loss": 0.8759,
      "step": 199
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.10160723328590393,
      "learning_rate": 0.0001256039090057547,
      "loss": 0.8923,
      "step": 200
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.1000576838850975,
      "learning_rate": 0.00012493585956878354,
      "loss": 0.8397,
      "step": 201
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.10324119031429291,
      "learning_rate": 0.0001242666213570672,
      "loss": 0.8787,
      "step": 202
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.1018192246556282,
      "learning_rate": 0.00012359622627540058,
      "loss": 0.8318,
      "step": 203
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.10028495639562607,
      "learning_rate": 0.00012292470628373037,
      "loss": 0.8354,
      "step": 204
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.10520325601100922,
      "learning_rate": 0.00012225209339563145,
      "loss": 0.8586,
      "step": 205
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.10229421406984329,
      "learning_rate": 0.00012157841967678063,
      "loss": 0.8961,
      "step": 206
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.10230256617069244,
      "learning_rate": 0.00012090371724342804,
      "loss": 0.8851,
      "step": 207
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.10282580554485321,
      "learning_rate": 0.00012022801826086609,
      "loss": 0.8588,
      "step": 208
    },
    {
      "epoch": 1.32,
      "grad_norm": 0.1008041650056839,
      "learning_rate": 0.00011955135494189588,
      "loss": 0.8462,
      "step": 209
    },
    {
      "epoch": 1.33,
      "grad_norm": 0.10294941067695618,
      "learning_rate": 0.00011887375954529168,
      "loss": 0.8301,
      "step": 210
    },
    {
      "epoch": 1.33,
      "grad_norm": 0.10411438345909119,
      "learning_rate": 0.00011819526437426298,
      "loss": 0.8299,
      "step": 211
    },
    {
      "epoch": 1.34,
      "grad_norm": 0.10526308417320251,
      "learning_rate": 0.0001175159017749144,
      "loss": 0.8624,
      "step": 212
    },
    {
      "epoch": 1.35,
      "grad_norm": 0.10539798438549042,
      "learning_rate": 0.00011683570413470383,
      "loss": 0.8473,
      "step": 213
    },
    {
      "epoch": 1.35,
      "grad_norm": 0.09961718320846558,
      "learning_rate": 0.00011615470388089835,
      "loss": 0.8571,
      "step": 214
    },
    {
      "epoch": 1.36,
      "grad_norm": 0.10262471437454224,
      "learning_rate": 0.00011547293347902812,
      "loss": 0.8539,
      "step": 215
    },
    {
      "epoch": 1.37,
      "grad_norm": 0.10586554557085037,
      "learning_rate": 0.00011479042543133895,
      "loss": 0.8559,
      "step": 216
    },
    {
      "epoch": 1.37,
      "grad_norm": 0.10486755520105362,
      "learning_rate": 0.00011410721227524255,
      "loss": 0.8726,
      "step": 217
    },
    {
      "epoch": 1.38,
      "grad_norm": 0.10485829412937164,
      "learning_rate": 0.00011342332658176555,
      "loss": 0.8903,
      "step": 218
    },
    {
      "epoch": 1.39,
      "grad_norm": 0.10306116938591003,
      "learning_rate": 0.00011273880095399667,
      "loss": 0.8495,
      "step": 219
    },
    {
      "epoch": 1.39,
      "grad_norm": 0.10205783694982529,
      "learning_rate": 0.0001120536680255323,
      "loss": 0.86,
      "step": 220
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.10305880010128021,
      "learning_rate": 0.00011136796045892102,
      "loss": 0.8617,
      "step": 221
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.10516184568405151,
      "learning_rate": 0.00011068171094410618,
      "loss": 0.8942,
      "step": 222
    },
    {
      "epoch": 1.41,
      "grad_norm": 0.10830646008253098,
      "learning_rate": 0.00010999495219686762,
      "loss": 0.8853,
      "step": 223
    },
    {
      "epoch": 1.42,
      "grad_norm": 0.10334376990795135,
      "learning_rate": 0.00010930771695726201,
      "loss": 0.8183,
      "step": 224
    },
    {
      "epoch": 1.42,
      "grad_norm": 0.10024702548980713,
      "learning_rate": 0.00010862003798806196,
      "loss": 0.8503,
      "step": 225
    },
    {
      "epoch": 1.43,
      "grad_norm": 0.10792502015829086,
      "learning_rate": 0.00010793194807319408,
      "loss": 0.8746,
      "step": 226
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.10143101215362549,
      "learning_rate": 0.00010724348001617625,
      "loss": 0.8321,
      "step": 227
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.09850587695837021,
      "learning_rate": 0.00010655466663855349,
      "loss": 0.8649,
      "step": 228
    },
    {
      "epoch": 1.45,
      "grad_norm": 0.10333626717329025,
      "learning_rate": 0.00010586554077833347,
      "loss": 0.8722,
      "step": 229
    },
    {
      "epoch": 1.46,
      "grad_norm": 0.10461243987083435,
      "learning_rate": 0.00010517613528842097,
      "loss": 0.8961,
      "step": 230
    },
    {
      "epoch": 1.46,
      "grad_norm": 0.10886025428771973,
      "learning_rate": 0.00010448648303505151,
      "loss": 0.8921,
      "step": 231
    },
    {
      "epoch": 1.47,
      "grad_norm": 0.10471045225858688,
      "learning_rate": 0.00010379661689622477,
      "loss": 0.8647,
      "step": 232
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.10855681449174881,
      "learning_rate": 0.00010310656976013705,
      "loss": 0.8783,
      "step": 233
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.11153094470500946,
      "learning_rate": 0.00010241637452361323,
      "loss": 0.8732,
      "step": 234
    },
    {
      "epoch": 1.49,
      "grad_norm": 0.10941874235868454,
      "learning_rate": 0.00010172606409053886,
      "loss": 0.8555,
      "step": 235
    },
    {
      "epoch": 1.49,
      "grad_norm": 0.10388068854808807,
      "learning_rate": 0.0001010356713702911,
      "loss": 0.8549,
      "step": 236
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.1045478880405426,
      "learning_rate": 0.00010034522927617014,
      "loss": 0.8706,
      "step": 237
    },
    {
      "epoch": 1.51,
      "grad_norm": 0.10188283771276474,
      "learning_rate": 9.96547707238299e-05,
      "loss": 0.8916,
      "step": 238
    },
    {
      "epoch": 1.51,
      "grad_norm": 0.10076084733009338,
      "learning_rate": 9.896432862970892e-05,
      "loss": 0.8841,
      "step": 239
    },
    {
      "epoch": 1.52,
      "grad_norm": 0.10353115946054459,
      "learning_rate": 9.827393590946116e-05,
      "loss": 0.8723,
      "step": 240
    },
    {
      "epoch": 1.53,
      "grad_norm": 0.10559534281492233,
      "learning_rate": 9.75836254763868e-05,
      "loss": 0.8679,
      "step": 241
    },
    {
      "epoch": 1.53,
      "grad_norm": 0.10226694494485855,
      "learning_rate": 9.689343023986302e-05,
      "loss": 0.8434,
      "step": 242
    },
    {
      "epoch": 1.54,
      "grad_norm": 0.10569630563259125,
      "learning_rate": 9.620338310377525e-05,
      "loss": 0.8768,
      "step": 243
    },
    {
      "epoch": 1.55,
      "grad_norm": 0.10112857073545456,
      "learning_rate": 9.551351696494854e-05,
      "loss": 0.8249,
      "step": 244
    },
    {
      "epoch": 1.55,
      "grad_norm": 0.10926086455583572,
      "learning_rate": 9.482386471157904e-05,
      "loss": 0.8634,
      "step": 245
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.1026775911450386,
      "learning_rate": 9.413445922166653e-05,
      "loss": 0.8679,
      "step": 246
    },
    {
      "epoch": 1.57,
      "grad_norm": 0.10021597146987915,
      "learning_rate": 9.344533336144652e-05,
      "loss": 0.8596,
      "step": 247
    },
    {
      "epoch": 1.57,
      "grad_norm": 0.1025286465883255,
      "learning_rate": 9.275651998382377e-05,
      "loss": 0.8302,
      "step": 248
    },
    {
      "epoch": 1.58,
      "grad_norm": 0.0997297540307045,
      "learning_rate": 9.206805192680593e-05,
      "loss": 0.8818,
      "step": 249
    },
    {
      "epoch": 1.58,
      "grad_norm": 0.10259710997343063,
      "learning_rate": 9.137996201193805e-05,
      "loss": 0.8558,
      "step": 250
    },
    {
      "epoch": 1.59,
      "grad_norm": 0.1076403483748436,
      "learning_rate": 9.069228304273802e-05,
      "loss": 0.8922,
      "step": 251
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.10561492294073105,
      "learning_rate": 9.00050478031324e-05,
      "loss": 0.8593,
      "step": 252
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.10473283380270004,
      "learning_rate": 8.931828905589385e-05,
      "loss": 0.8893,
      "step": 253
    },
    {
      "epoch": 1.61,
      "grad_norm": 0.11126771569252014,
      "learning_rate": 8.863203954107902e-05,
      "loss": 0.8761,
      "step": 254
    },
    {
      "epoch": 1.62,
      "grad_norm": 0.10318706929683685,
      "learning_rate": 8.79463319744677e-05,
      "loss": 0.8688,
      "step": 255
    },
    {
      "epoch": 1.62,
      "grad_norm": 0.10156065970659256,
      "learning_rate": 8.726119904600336e-05,
      "loss": 0.8683,
      "step": 256
    },
    {
      "epoch": 1.63,
      "grad_norm": 0.0986093133687973,
      "learning_rate": 8.657667341823448e-05,
      "loss": 0.8375,
      "step": 257
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.10292702913284302,
      "learning_rate": 8.589278772475749e-05,
      "loss": 0.8623,
      "step": 258
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.1031164601445198,
      "learning_rate": 8.520957456866107e-05,
      "loss": 0.8633,
      "step": 259
    },
    {
      "epoch": 1.65,
      "grad_norm": 0.0997462198138237,
      "learning_rate": 8.452706652097186e-05,
      "loss": 0.8462,
      "step": 260
    },
    {
      "epoch": 1.66,
      "grad_norm": 0.11357836425304413,
      "learning_rate": 8.384529611910163e-05,
      "loss": 0.8316,
      "step": 261
    },
    {
      "epoch": 1.66,
      "grad_norm": 0.10290369391441345,
      "learning_rate": 8.316429586529615e-05,
      "loss": 0.8357,
      "step": 262
    },
    {
      "epoch": 1.67,
      "grad_norm": 0.10479974746704102,
      "learning_rate": 8.248409822508561e-05,
      "loss": 0.8591,
      "step": 263
    },
    {
      "epoch": 1.67,
      "grad_norm": 0.1035972386598587,
      "learning_rate": 8.180473562573705e-05,
      "loss": 0.8351,
      "step": 264
    },
    {
      "epoch": 1.68,
      "grad_norm": 0.10530169308185577,
      "learning_rate": 8.112624045470835e-05,
      "loss": 0.8387,
      "step": 265
    },
    {
      "epoch": 1.69,
      "grad_norm": 0.09917616844177246,
      "learning_rate": 8.044864505810414e-05,
      "loss": 0.8699,
      "step": 266
    },
    {
      "epoch": 1.69,
      "grad_norm": 0.10452553629875183,
      "learning_rate": 7.977198173913394e-05,
      "loss": 0.859,
      "step": 267
    },
    {
      "epoch": 1.7,
      "grad_norm": 0.10981190949678421,
      "learning_rate": 7.909628275657198e-05,
      "loss": 0.8443,
      "step": 268
    },
    {
      "epoch": 1.71,
      "grad_norm": 0.10094719380140305,
      "learning_rate": 7.84215803232194e-05,
      "loss": 0.8461,
      "step": 269
    },
    {
      "epoch": 1.71,
      "grad_norm": 0.11014903336763382,
      "learning_rate": 7.774790660436858e-05,
      "loss": 0.8409,
      "step": 270
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.10279539972543716,
      "learning_rate": 7.707529371626965e-05,
      "loss": 0.8588,
      "step": 271
    },
    {
      "epoch": 1.73,
      "grad_norm": 0.10260649770498276,
      "learning_rate": 7.640377372459945e-05,
      "loss": 0.8702,
      "step": 272
    },
    {
      "epoch": 1.73,
      "grad_norm": 0.10066790878772736,
      "learning_rate": 7.573337864293283e-05,
      "loss": 0.8523,
      "step": 273
    },
    {
      "epoch": 1.74,
      "grad_norm": 0.10237132757902145,
      "learning_rate": 7.506414043121647e-05,
      "loss": 0.8676,
      "step": 274
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.10095226764678955,
      "learning_rate": 7.43960909942453e-05,
      "loss": 0.8802,
      "step": 275
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.10220375657081604,
      "learning_rate": 7.372926218014131e-05,
      "loss": 0.8347,
      "step": 276
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.09893085062503815,
      "learning_rate": 7.306368577883547e-05,
      "loss": 0.8565,
      "step": 277
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.10145033895969391,
      "learning_rate": 7.239939352055208e-05,
      "loss": 0.8565,
      "step": 278
    },
    {
      "epoch": 1.77,
      "grad_norm": 0.10070981830358505,
      "learning_rate": 7.173641707429606e-05,
      "loss": 0.8475,
      "step": 279
    },
    {
      "epoch": 1.78,
      "grad_norm": 0.10595450550317764,
      "learning_rate": 7.107478804634325e-05,
      "loss": 0.8368,
      "step": 280
    },
    {
      "epoch": 1.78,
      "grad_norm": 0.1085483580827713,
      "learning_rate": 7.041453797873363e-05,
      "loss": 0.866,
      "step": 281
    },
    {
      "epoch": 1.79,
      "grad_norm": 0.1005505919456482,
      "learning_rate": 6.975569834776758e-05,
      "loss": 0.8393,
      "step": 282
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.10384807735681534,
      "learning_rate": 6.909830056250527e-05,
      "loss": 0.8571,
      "step": 283
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.10558132082223892,
      "learning_rate": 6.844237596326941e-05,
      "loss": 0.8477,
      "step": 284
    },
    {
      "epoch": 1.81,
      "grad_norm": 0.10049811750650406,
      "learning_rate": 6.778795582015097e-05,
      "loss": 0.8317,
      "step": 285
    },
    {
      "epoch": 1.82,
      "grad_norm": 0.10088322311639786,
      "learning_rate": 6.713507133151857e-05,
      "loss": 0.8602,
      "step": 286
    },
    {
      "epoch": 1.82,
      "grad_norm": 0.09972478449344635,
      "learning_rate": 6.648375362253118e-05,
      "loss": 0.8514,
      "step": 287
    },
    {
      "epoch": 1.83,
      "grad_norm": 0.10518650710582733,
      "learning_rate": 6.583403374365405e-05,
      "loss": 0.8732,
      "step": 288
    },
    {
      "epoch": 1.84,
      "grad_norm": 0.10432999581098557,
      "learning_rate": 6.518594266917882e-05,
      "loss": 0.9014,
      "step": 289
    },
    {
      "epoch": 1.84,
      "grad_norm": 0.10011251270771027,
      "learning_rate": 6.453951129574644e-05,
      "loss": 0.8192,
      "step": 290
    },
    {
      "epoch": 1.85,
      "grad_norm": 0.09896595031023026,
      "learning_rate": 6.389477044087452e-05,
      "loss": 0.8148,
      "step": 291
    },
    {
      "epoch": 1.85,
      "grad_norm": 0.10062045603990555,
      "learning_rate": 6.325175084148809e-05,
      "loss": 0.8303,
      "step": 292
    },
    {
      "epoch": 1.86,
      "grad_norm": 0.1033136248588562,
      "learning_rate": 6.261048315245419e-05,
      "loss": 0.8537,
      "step": 293
    },
    {
      "epoch": 1.87,
      "grad_norm": 0.10438770055770874,
      "learning_rate": 6.197099794512056e-05,
      "loss": 0.8881,
      "step": 294
    },
    {
      "epoch": 1.87,
      "grad_norm": 0.1036936491727829,
      "learning_rate": 6.133332570585812e-05,
      "loss": 0.85,
      "step": 295
    },
    {
      "epoch": 1.88,
      "grad_norm": 0.10618463903665543,
      "learning_rate": 6.069749683460765e-05,
      "loss": 0.8618,
      "step": 296
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.10110071301460266,
      "learning_rate": 6.006354164343046e-05,
      "loss": 0.8632,
      "step": 297
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.10008033365011215,
      "learning_rate": 5.943149035506337e-05,
      "loss": 0.8648,
      "step": 298
    },
    {
      "epoch": 1.9,
      "grad_norm": 0.09985347092151642,
      "learning_rate": 5.880137310147782e-05,
      "loss": 0.893,
      "step": 299
    },
    {
      "epoch": 1.91,
      "grad_norm": 0.09989628195762634,
      "learning_rate": 5.817321992244351e-05,
      "loss": 0.8682,
      "step": 300
    },
    {
      "epoch": 1.91,
      "grad_norm": 0.10163696855306625,
      "learning_rate": 5.754706076409613e-05,
      "loss": 0.8554,
      "step": 301
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.10319127887487411,
      "learning_rate": 5.692292547750988e-05,
      "loss": 0.8603,
      "step": 302
    },
    {
      "epoch": 1.93,
      "grad_norm": 0.10187959671020508,
      "learning_rate": 5.630084381727434e-05,
      "loss": 0.8628,
      "step": 303
    },
    {
      "epoch": 1.93,
      "grad_norm": 0.09999451041221619,
      "learning_rate": 5.568084544007588e-05,
      "loss": 0.8471,
      "step": 304
    },
    {
      "epoch": 1.94,
      "grad_norm": 0.10175001621246338,
      "learning_rate": 5.506295990328385e-05,
      "loss": 0.8371,
      "step": 305
    },
    {
      "epoch": 1.94,
      "grad_norm": 0.10206517577171326,
      "learning_rate": 5.444721666354169e-05,
      "loss": 0.8836,
      "step": 306
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.10593275725841522,
      "learning_rate": 5.383364507536229e-05,
      "loss": 0.8397,
      "step": 307
    },
    {
      "epoch": 1.96,
      "grad_norm": 0.10138902068138123,
      "learning_rate": 5.32222743897288e-05,
      "loss": 0.8749,
      "step": 308
    },
    {
      "epoch": 1.96,
      "grad_norm": 0.1002379059791565,
      "learning_rate": 5.261313375270014e-05,
      "loss": 0.8338,
      "step": 309
    },
    {
      "epoch": 1.97,
      "grad_norm": 0.11006593704223633,
      "learning_rate": 5.200625220402139e-05,
      "loss": 0.8779,
      "step": 310
    }
  ],
  "logging_steps": 1,
  "max_steps": 465,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 155,
  "total_flos": 6.721047353232458e+18,
  "train_batch_size": 5,
  "trial_name": null,
  "trial_params": null
}