{ "best_metric": 1.5823931694030762, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.11293054771315642, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000564652738565782, "grad_norm": 0.5402987599372864, "learning_rate": 1.013e-05, "loss": 1.4633, "step": 1 }, { "epoch": 0.000564652738565782, "eval_loss": 2.135667562484741, "eval_runtime": 33.7351, "eval_samples_per_second": 22.113, "eval_steps_per_second": 5.543, "step": 1 }, { "epoch": 0.001129305477131564, "grad_norm": 0.36689239740371704, "learning_rate": 2.026e-05, "loss": 1.8992, "step": 2 }, { "epoch": 0.0016939582156973462, "grad_norm": 0.3763646185398102, "learning_rate": 3.039e-05, "loss": 0.8535, "step": 3 }, { "epoch": 0.002258610954263128, "grad_norm": 0.34807297587394714, "learning_rate": 4.052e-05, "loss": 1.7749, "step": 4 }, { "epoch": 0.00282326369282891, "grad_norm": 0.5206761360168457, "learning_rate": 5.065e-05, "loss": 1.8843, "step": 5 }, { "epoch": 0.0033879164313946925, "grad_norm": 0.3446935713291168, "learning_rate": 6.078e-05, "loss": 2.4299, "step": 6 }, { "epoch": 0.003952569169960474, "grad_norm": 0.353129506111145, "learning_rate": 7.091e-05, "loss": 1.9346, "step": 7 }, { "epoch": 0.004517221908526256, "grad_norm": 0.2856239378452301, "learning_rate": 8.104e-05, "loss": 1.7326, "step": 8 }, { "epoch": 0.005081874647092038, "grad_norm": 0.4531644880771637, "learning_rate": 9.117e-05, "loss": 1.7827, "step": 9 }, { "epoch": 0.00564652738565782, "grad_norm": 0.47059357166290283, "learning_rate": 0.0001013, "loss": 2.4921, "step": 10 }, { "epoch": 0.006211180124223602, "grad_norm": 0.4820435345172882, "learning_rate": 0.00010076684210526316, "loss": 1.8598, "step": 11 }, { "epoch": 0.006775832862789385, "grad_norm": 0.37354958057403564, "learning_rate": 0.0001002336842105263, "loss": 1.7825, "step": 12 }, { "epoch": 0.007340485601355167, "grad_norm": 0.3483826220035553, "learning_rate": 9.970052631578946e-05, "loss": 2.0361, "step": 13 }, { "epoch": 0.007905138339920948, "grad_norm": 0.45971325039863586, "learning_rate": 9.916736842105263e-05, "loss": 1.9319, "step": 14 }, { "epoch": 0.00846979107848673, "grad_norm": 0.9301943182945251, "learning_rate": 9.863421052631579e-05, "loss": 1.6979, "step": 15 }, { "epoch": 0.009034443817052512, "grad_norm": 0.32556048035621643, "learning_rate": 9.810105263157895e-05, "loss": 2.2257, "step": 16 }, { "epoch": 0.009599096555618294, "grad_norm": 0.46048063039779663, "learning_rate": 9.756789473684211e-05, "loss": 1.7091, "step": 17 }, { "epoch": 0.010163749294184076, "grad_norm": 0.3488789498806, "learning_rate": 9.703473684210525e-05, "loss": 2.019, "step": 18 }, { "epoch": 0.010728402032749858, "grad_norm": 0.5194416642189026, "learning_rate": 9.650157894736842e-05, "loss": 1.9075, "step": 19 }, { "epoch": 0.01129305477131564, "grad_norm": 0.46956583857536316, "learning_rate": 9.596842105263158e-05, "loss": 1.8816, "step": 20 }, { "epoch": 0.011857707509881422, "grad_norm": 0.4933469891548157, "learning_rate": 9.543526315789474e-05, "loss": 1.729, "step": 21 }, { "epoch": 0.012422360248447204, "grad_norm": 0.3997938930988312, "learning_rate": 9.49021052631579e-05, "loss": 1.6959, "step": 22 }, { "epoch": 0.012987012987012988, "grad_norm": 0.5408119559288025, "learning_rate": 9.436894736842105e-05, "loss": 2.0254, "step": 23 }, { "epoch": 0.01355166572557877, "grad_norm": 0.41266539692878723, "learning_rate": 9.38357894736842e-05, "loss": 1.9554, "step": 24 }, { "epoch": 0.014116318464144552, "grad_norm": 0.5079236030578613, "learning_rate": 9.330263157894737e-05, "loss": 2.0457, "step": 25 }, { "epoch": 0.014680971202710334, "grad_norm": 0.5814744830131531, "learning_rate": 9.276947368421051e-05, "loss": 2.0399, "step": 26 }, { "epoch": 0.015245623941276116, "grad_norm": 0.6485530138015747, "learning_rate": 9.223631578947369e-05, "loss": 1.4858, "step": 27 }, { "epoch": 0.015810276679841896, "grad_norm": 0.5585585832595825, "learning_rate": 9.170315789473684e-05, "loss": 1.9193, "step": 28 }, { "epoch": 0.01637492941840768, "grad_norm": 0.6046280860900879, "learning_rate": 9.117e-05, "loss": 2.3537, "step": 29 }, { "epoch": 0.01693958215697346, "grad_norm": 0.5263966917991638, "learning_rate": 9.063684210526316e-05, "loss": 1.5335, "step": 30 }, { "epoch": 0.017504234895539244, "grad_norm": 0.8364748358726501, "learning_rate": 9.010368421052632e-05, "loss": 1.7004, "step": 31 }, { "epoch": 0.018068887634105024, "grad_norm": 0.5951048135757446, "learning_rate": 8.957052631578946e-05, "loss": 2.4489, "step": 32 }, { "epoch": 0.018633540372670808, "grad_norm": 0.5281526446342468, "learning_rate": 8.903736842105263e-05, "loss": 1.1862, "step": 33 }, { "epoch": 0.019198193111236588, "grad_norm": 0.8356189727783203, "learning_rate": 8.850421052631579e-05, "loss": 1.5012, "step": 34 }, { "epoch": 0.019762845849802372, "grad_norm": 0.6029136180877686, "learning_rate": 8.797105263157895e-05, "loss": 1.4578, "step": 35 }, { "epoch": 0.020327498588368152, "grad_norm": 0.6786731481552124, "learning_rate": 8.743789473684211e-05, "loss": 1.547, "step": 36 }, { "epoch": 0.020892151326933936, "grad_norm": 0.877357542514801, "learning_rate": 8.690473684210526e-05, "loss": 1.7824, "step": 37 }, { "epoch": 0.021456804065499716, "grad_norm": 1.265729546546936, "learning_rate": 8.637157894736842e-05, "loss": 2.1181, "step": 38 }, { "epoch": 0.0220214568040655, "grad_norm": 1.4670414924621582, "learning_rate": 8.583842105263158e-05, "loss": 1.3309, "step": 39 }, { "epoch": 0.02258610954263128, "grad_norm": 1.3509232997894287, "learning_rate": 8.530526315789472e-05, "loss": 2.5148, "step": 40 }, { "epoch": 0.023150762281197064, "grad_norm": 1.4195291996002197, "learning_rate": 8.47721052631579e-05, "loss": 2.0859, "step": 41 }, { "epoch": 0.023715415019762844, "grad_norm": 1.6967036724090576, "learning_rate": 8.423894736842105e-05, "loss": 1.6782, "step": 42 }, { "epoch": 0.024280067758328628, "grad_norm": 1.9785702228546143, "learning_rate": 8.37057894736842e-05, "loss": 1.8815, "step": 43 }, { "epoch": 0.024844720496894408, "grad_norm": 2.209860324859619, "learning_rate": 8.317263157894737e-05, "loss": 1.5545, "step": 44 }, { "epoch": 0.025409373235460192, "grad_norm": 2.2609925270080566, "learning_rate": 8.263947368421053e-05, "loss": 1.3009, "step": 45 }, { "epoch": 0.025974025974025976, "grad_norm": 3.240757703781128, "learning_rate": 8.210631578947368e-05, "loss": 2.1366, "step": 46 }, { "epoch": 0.026538678712591756, "grad_norm": 3.1858623027801514, "learning_rate": 8.157315789473684e-05, "loss": 1.119, "step": 47 }, { "epoch": 0.02710333145115754, "grad_norm": 3.827558994293213, "learning_rate": 8.104e-05, "loss": 1.5671, "step": 48 }, { "epoch": 0.02766798418972332, "grad_norm": 4.33996057510376, "learning_rate": 8.050684210526316e-05, "loss": 1.3891, "step": 49 }, { "epoch": 0.028232636928289104, "grad_norm": 5.596718788146973, "learning_rate": 7.997368421052632e-05, "loss": 1.572, "step": 50 }, { "epoch": 0.028232636928289104, "eval_loss": 1.7320042848587036, "eval_runtime": 33.7715, "eval_samples_per_second": 22.09, "eval_steps_per_second": 5.537, "step": 50 }, { "epoch": 0.028797289666854884, "grad_norm": 0.710443913936615, "learning_rate": 7.944052631578947e-05, "loss": 2.2345, "step": 51 }, { "epoch": 0.029361942405420668, "grad_norm": 0.6612917184829712, "learning_rate": 7.890736842105263e-05, "loss": 2.0899, "step": 52 }, { "epoch": 0.029926595143986448, "grad_norm": 0.7144291996955872, "learning_rate": 7.837421052631579e-05, "loss": 0.9424, "step": 53 }, { "epoch": 0.030491247882552232, "grad_norm": 0.6780564188957214, "learning_rate": 7.784105263157893e-05, "loss": 2.0563, "step": 54 }, { "epoch": 0.031055900621118012, "grad_norm": 0.7346010208129883, "learning_rate": 7.730789473684211e-05, "loss": 1.8291, "step": 55 }, { "epoch": 0.03162055335968379, "grad_norm": 0.4565852880477905, "learning_rate": 7.677473684210526e-05, "loss": 1.2565, "step": 56 }, { "epoch": 0.032185206098249576, "grad_norm": 0.5821838974952698, "learning_rate": 7.624157894736842e-05, "loss": 1.6761, "step": 57 }, { "epoch": 0.03274985883681536, "grad_norm": 1.3336795568466187, "learning_rate": 7.570842105263158e-05, "loss": 1.9984, "step": 58 }, { "epoch": 0.033314511575381144, "grad_norm": 0.7882184982299805, "learning_rate": 7.517526315789474e-05, "loss": 0.7926, "step": 59 }, { "epoch": 0.03387916431394692, "grad_norm": 0.4720506966114044, "learning_rate": 7.464210526315789e-05, "loss": 1.5786, "step": 60 }, { "epoch": 0.034443817052512704, "grad_norm": 0.4207099974155426, "learning_rate": 7.410894736842106e-05, "loss": 1.1193, "step": 61 }, { "epoch": 0.03500846979107849, "grad_norm": 0.5080116391181946, "learning_rate": 7.35757894736842e-05, "loss": 1.6818, "step": 62 }, { "epoch": 0.03557312252964427, "grad_norm": 0.5660998225212097, "learning_rate": 7.304263157894737e-05, "loss": 1.8537, "step": 63 }, { "epoch": 0.03613777526821005, "grad_norm": 0.4343119263648987, "learning_rate": 7.250947368421053e-05, "loss": 1.3433, "step": 64 }, { "epoch": 0.03670242800677583, "grad_norm": 0.561114490032196, "learning_rate": 7.197631578947368e-05, "loss": 1.6089, "step": 65 }, { "epoch": 0.037267080745341616, "grad_norm": 0.41408398747444153, "learning_rate": 7.144315789473684e-05, "loss": 2.1082, "step": 66 }, { "epoch": 0.0378317334839074, "grad_norm": 0.40642455220222473, "learning_rate": 7.091e-05, "loss": 1.952, "step": 67 }, { "epoch": 0.038396386222473176, "grad_norm": 0.49146607518196106, "learning_rate": 7.037684210526316e-05, "loss": 1.929, "step": 68 }, { "epoch": 0.03896103896103896, "grad_norm": 0.43934860825538635, "learning_rate": 6.984368421052632e-05, "loss": 1.8151, "step": 69 }, { "epoch": 0.039525691699604744, "grad_norm": 0.39021074771881104, "learning_rate": 6.931052631578947e-05, "loss": 1.7151, "step": 70 }, { "epoch": 0.04009034443817053, "grad_norm": 0.5213768482208252, "learning_rate": 6.877736842105263e-05, "loss": 2.1129, "step": 71 }, { "epoch": 0.040654997176736304, "grad_norm": 0.42245039343833923, "learning_rate": 6.824421052631579e-05, "loss": 2.1972, "step": 72 }, { "epoch": 0.04121964991530209, "grad_norm": 0.4925639033317566, "learning_rate": 6.771105263157895e-05, "loss": 2.3312, "step": 73 }, { "epoch": 0.04178430265386787, "grad_norm": 0.41615986824035645, "learning_rate": 6.71778947368421e-05, "loss": 1.9326, "step": 74 }, { "epoch": 0.042348955392433656, "grad_norm": 0.4477844834327698, "learning_rate": 6.664473684210527e-05, "loss": 1.9983, "step": 75 }, { "epoch": 0.04291360813099943, "grad_norm": 0.45667552947998047, "learning_rate": 6.611157894736842e-05, "loss": 1.7302, "step": 76 }, { "epoch": 0.043478260869565216, "grad_norm": 0.5657249093055725, "learning_rate": 6.557842105263158e-05, "loss": 1.3501, "step": 77 }, { "epoch": 0.044042913608131, "grad_norm": 0.48479339480400085, "learning_rate": 6.504526315789474e-05, "loss": 2.1838, "step": 78 }, { "epoch": 0.044607566346696784, "grad_norm": 0.5657017230987549, "learning_rate": 6.451210526315789e-05, "loss": 1.5316, "step": 79 }, { "epoch": 0.04517221908526256, "grad_norm": 0.6498041152954102, "learning_rate": 6.397894736842105e-05, "loss": 1.7707, "step": 80 }, { "epoch": 0.045736871823828344, "grad_norm": 0.5205366611480713, "learning_rate": 6.344578947368421e-05, "loss": 1.5243, "step": 81 }, { "epoch": 0.04630152456239413, "grad_norm": 0.6816235780715942, "learning_rate": 6.291263157894737e-05, "loss": 1.9903, "step": 82 }, { "epoch": 0.04686617730095991, "grad_norm": 0.5851727724075317, "learning_rate": 6.237947368421053e-05, "loss": 1.086, "step": 83 }, { "epoch": 0.04743083003952569, "grad_norm": 0.5946785807609558, "learning_rate": 6.184631578947368e-05, "loss": 1.6933, "step": 84 }, { "epoch": 0.04799548277809147, "grad_norm": 0.9167661666870117, "learning_rate": 6.131315789473684e-05, "loss": 1.3954, "step": 85 }, { "epoch": 0.048560135516657256, "grad_norm": 0.6855114698410034, "learning_rate": 6.078e-05, "loss": 1.6794, "step": 86 }, { "epoch": 0.04912478825522304, "grad_norm": 0.9877620935440063, "learning_rate": 6.024684210526315e-05, "loss": 1.5267, "step": 87 }, { "epoch": 0.049689440993788817, "grad_norm": 1.2663705348968506, "learning_rate": 5.9713684210526305e-05, "loss": 1.2142, "step": 88 }, { "epoch": 0.0502540937323546, "grad_norm": 0.8730637431144714, "learning_rate": 5.918052631578947e-05, "loss": 1.9942, "step": 89 }, { "epoch": 0.050818746470920384, "grad_norm": 1.022507905960083, "learning_rate": 5.8647368421052634e-05, "loss": 1.7698, "step": 90 }, { "epoch": 0.05138339920948617, "grad_norm": 1.0070384740829468, "learning_rate": 5.811421052631579e-05, "loss": 2.1813, "step": 91 }, { "epoch": 0.05194805194805195, "grad_norm": 1.0603997707366943, "learning_rate": 5.758105263157894e-05, "loss": 1.5043, "step": 92 }, { "epoch": 0.05251270468661773, "grad_norm": 1.5354747772216797, "learning_rate": 5.70478947368421e-05, "loss": 1.4871, "step": 93 }, { "epoch": 0.05307735742518351, "grad_norm": 1.4360660314559937, "learning_rate": 5.6514736842105256e-05, "loss": 1.1194, "step": 94 }, { "epoch": 0.053642010163749296, "grad_norm": 2.7259883880615234, "learning_rate": 5.5981578947368424e-05, "loss": 1.3859, "step": 95 }, { "epoch": 0.05420666290231508, "grad_norm": 2.0067079067230225, "learning_rate": 5.544842105263158e-05, "loss": 1.1774, "step": 96 }, { "epoch": 0.054771315640880856, "grad_norm": 1.9540057182312012, "learning_rate": 5.491526315789474e-05, "loss": 1.838, "step": 97 }, { "epoch": 0.05533596837944664, "grad_norm": 3.2931625843048096, "learning_rate": 5.438210526315789e-05, "loss": 1.8081, "step": 98 }, { "epoch": 0.055900621118012424, "grad_norm": 4.112388610839844, "learning_rate": 5.384894736842105e-05, "loss": 1.0519, "step": 99 }, { "epoch": 0.05646527385657821, "grad_norm": 4.771429061889648, "learning_rate": 5.331578947368421e-05, "loss": 2.072, "step": 100 }, { "epoch": 0.05646527385657821, "eval_loss": 1.61591374874115, "eval_runtime": 33.6813, "eval_samples_per_second": 22.149, "eval_steps_per_second": 5.552, "step": 100 }, { "epoch": 0.057029926595143984, "grad_norm": 0.26157695055007935, "learning_rate": 5.278263157894736e-05, "loss": 1.3786, "step": 101 }, { "epoch": 0.05759457933370977, "grad_norm": 0.31872278451919556, "learning_rate": 5.224947368421053e-05, "loss": 1.7082, "step": 102 }, { "epoch": 0.05815923207227555, "grad_norm": 0.40259334444999695, "learning_rate": 5.171631578947368e-05, "loss": 1.7991, "step": 103 }, { "epoch": 0.058723884810841336, "grad_norm": 0.5885865688323975, "learning_rate": 5.1183157894736844e-05, "loss": 1.5969, "step": 104 }, { "epoch": 0.05928853754940711, "grad_norm": 0.41334208846092224, "learning_rate": 5.065e-05, "loss": 1.8226, "step": 105 }, { "epoch": 0.059853190287972896, "grad_norm": 0.5378217101097107, "learning_rate": 5.011684210526315e-05, "loss": 1.6466, "step": 106 }, { "epoch": 0.06041784302653868, "grad_norm": 0.5085050463676453, "learning_rate": 4.958368421052631e-05, "loss": 1.9653, "step": 107 }, { "epoch": 0.060982495765104464, "grad_norm": 0.49602341651916504, "learning_rate": 4.9050526315789473e-05, "loss": 1.409, "step": 108 }, { "epoch": 0.06154714850367024, "grad_norm": 0.5390165448188782, "learning_rate": 4.851736842105263e-05, "loss": 1.3066, "step": 109 }, { "epoch": 0.062111801242236024, "grad_norm": 0.5828697085380554, "learning_rate": 4.798421052631579e-05, "loss": 1.8615, "step": 110 }, { "epoch": 0.06267645398080181, "grad_norm": 0.63508141040802, "learning_rate": 4.745105263157895e-05, "loss": 2.0458, "step": 111 }, { "epoch": 0.06324110671936758, "grad_norm": 0.5427883863449097, "learning_rate": 4.69178947368421e-05, "loss": 1.7312, "step": 112 }, { "epoch": 0.06380575945793338, "grad_norm": 0.5502323508262634, "learning_rate": 4.638473684210526e-05, "loss": 2.3985, "step": 113 }, { "epoch": 0.06437041219649915, "grad_norm": 0.4255567491054535, "learning_rate": 4.585157894736842e-05, "loss": 2.0969, "step": 114 }, { "epoch": 0.06493506493506493, "grad_norm": 0.6290948390960693, "learning_rate": 4.531842105263158e-05, "loss": 1.712, "step": 115 }, { "epoch": 0.06549971767363072, "grad_norm": 0.6249313950538635, "learning_rate": 4.478526315789473e-05, "loss": 1.1716, "step": 116 }, { "epoch": 0.0660643704121965, "grad_norm": 0.4874264597892761, "learning_rate": 4.425210526315789e-05, "loss": 1.614, "step": 117 }, { "epoch": 0.06662902315076229, "grad_norm": 0.5078814625740051, "learning_rate": 4.3718947368421054e-05, "loss": 2.0362, "step": 118 }, { "epoch": 0.06719367588932806, "grad_norm": 0.6948334574699402, "learning_rate": 4.318578947368421e-05, "loss": 1.6969, "step": 119 }, { "epoch": 0.06775832862789384, "grad_norm": 0.46054255962371826, "learning_rate": 4.265263157894736e-05, "loss": 1.9061, "step": 120 }, { "epoch": 0.06832298136645963, "grad_norm": 0.4674294590950012, "learning_rate": 4.211947368421052e-05, "loss": 1.4067, "step": 121 }, { "epoch": 0.06888763410502541, "grad_norm": 0.49073663353919983, "learning_rate": 4.1586315789473684e-05, "loss": 2.3371, "step": 122 }, { "epoch": 0.06945228684359118, "grad_norm": 0.49056804180145264, "learning_rate": 4.105315789473684e-05, "loss": 2.1801, "step": 123 }, { "epoch": 0.07001693958215698, "grad_norm": 0.608372688293457, "learning_rate": 4.052e-05, "loss": 2.0307, "step": 124 }, { "epoch": 0.07058159232072275, "grad_norm": 0.46089833974838257, "learning_rate": 3.998684210526316e-05, "loss": 1.9543, "step": 125 }, { "epoch": 0.07114624505928854, "grad_norm": 0.4646291136741638, "learning_rate": 3.945368421052631e-05, "loss": 1.6708, "step": 126 }, { "epoch": 0.07171089779785432, "grad_norm": 0.545235276222229, "learning_rate": 3.892052631578947e-05, "loss": 2.0372, "step": 127 }, { "epoch": 0.0722755505364201, "grad_norm": 0.7160872220993042, "learning_rate": 3.838736842105263e-05, "loss": 2.0102, "step": 128 }, { "epoch": 0.07284020327498589, "grad_norm": 0.5753077864646912, "learning_rate": 3.785421052631579e-05, "loss": 1.7705, "step": 129 }, { "epoch": 0.07340485601355166, "grad_norm": 0.6134698390960693, "learning_rate": 3.732105263157894e-05, "loss": 1.5995, "step": 130 }, { "epoch": 0.07396950875211744, "grad_norm": 0.5752911567687988, "learning_rate": 3.67878947368421e-05, "loss": 1.4896, "step": 131 }, { "epoch": 0.07453416149068323, "grad_norm": 0.7670300602912903, "learning_rate": 3.6254736842105264e-05, "loss": 1.784, "step": 132 }, { "epoch": 0.07509881422924901, "grad_norm": 0.5910040140151978, "learning_rate": 3.572157894736842e-05, "loss": 1.3716, "step": 133 }, { "epoch": 0.0756634669678148, "grad_norm": 0.5962346196174622, "learning_rate": 3.518842105263158e-05, "loss": 1.6385, "step": 134 }, { "epoch": 0.07622811970638058, "grad_norm": 1.3207964897155762, "learning_rate": 3.465526315789473e-05, "loss": 1.2742, "step": 135 }, { "epoch": 0.07679277244494635, "grad_norm": 1.1188924312591553, "learning_rate": 3.4122105263157894e-05, "loss": 1.9659, "step": 136 }, { "epoch": 0.07735742518351214, "grad_norm": 0.6829027533531189, "learning_rate": 3.358894736842105e-05, "loss": 1.1681, "step": 137 }, { "epoch": 0.07792207792207792, "grad_norm": 0.8137729167938232, "learning_rate": 3.305578947368421e-05, "loss": 1.4063, "step": 138 }, { "epoch": 0.07848673066064371, "grad_norm": 0.711093008518219, "learning_rate": 3.252263157894737e-05, "loss": 1.4551, "step": 139 }, { "epoch": 0.07905138339920949, "grad_norm": 1.0196043252944946, "learning_rate": 3.198947368421052e-05, "loss": 1.8365, "step": 140 }, { "epoch": 0.07961603613777526, "grad_norm": 0.8647313714027405, "learning_rate": 3.1456315789473684e-05, "loss": 1.253, "step": 141 }, { "epoch": 0.08018068887634106, "grad_norm": 0.863608717918396, "learning_rate": 3.092315789473684e-05, "loss": 1.2664, "step": 142 }, { "epoch": 0.08074534161490683, "grad_norm": 1.0043619871139526, "learning_rate": 3.039e-05, "loss": 1.5368, "step": 143 }, { "epoch": 0.08130999435347261, "grad_norm": 1.3296562433242798, "learning_rate": 2.9856842105263153e-05, "loss": 0.8377, "step": 144 }, { "epoch": 0.0818746470920384, "grad_norm": 1.5807889699935913, "learning_rate": 2.9323684210526317e-05, "loss": 1.4706, "step": 145 }, { "epoch": 0.08243929983060418, "grad_norm": 1.4404282569885254, "learning_rate": 2.879052631578947e-05, "loss": 1.6419, "step": 146 }, { "epoch": 0.08300395256916997, "grad_norm": 1.8414446115493774, "learning_rate": 2.8257368421052628e-05, "loss": 1.2406, "step": 147 }, { "epoch": 0.08356860530773574, "grad_norm": 6.275533199310303, "learning_rate": 2.772421052631579e-05, "loss": 2.1809, "step": 148 }, { "epoch": 0.08413325804630152, "grad_norm": 3.404439687728882, "learning_rate": 2.7191052631578946e-05, "loss": 1.8676, "step": 149 }, { "epoch": 0.08469791078486731, "grad_norm": 4.620980262756348, "learning_rate": 2.6657894736842104e-05, "loss": 2.8781, "step": 150 }, { "epoch": 0.08469791078486731, "eval_loss": 1.586492657661438, "eval_runtime": 33.7145, "eval_samples_per_second": 22.127, "eval_steps_per_second": 5.547, "step": 150 }, { "epoch": 0.08526256352343309, "grad_norm": 0.3272818326950073, "learning_rate": 2.6124736842105265e-05, "loss": 1.4871, "step": 151 }, { "epoch": 0.08582721626199886, "grad_norm": 0.32894277572631836, "learning_rate": 2.5591578947368422e-05, "loss": 1.7465, "step": 152 }, { "epoch": 0.08639186900056466, "grad_norm": 0.34903210401535034, "learning_rate": 2.5058421052631576e-05, "loss": 2.1659, "step": 153 }, { "epoch": 0.08695652173913043, "grad_norm": 0.309485524892807, "learning_rate": 2.4525263157894737e-05, "loss": 0.9477, "step": 154 }, { "epoch": 0.08752117447769622, "grad_norm": 0.3330695927143097, "learning_rate": 2.3992105263157894e-05, "loss": 1.5259, "step": 155 }, { "epoch": 0.088085827216262, "grad_norm": 0.3170928657054901, "learning_rate": 2.345894736842105e-05, "loss": 2.1571, "step": 156 }, { "epoch": 0.08865047995482778, "grad_norm": 0.44350671768188477, "learning_rate": 2.292578947368421e-05, "loss": 1.7132, "step": 157 }, { "epoch": 0.08921513269339357, "grad_norm": 0.3251064419746399, "learning_rate": 2.2392631578947366e-05, "loss": 1.7776, "step": 158 }, { "epoch": 0.08977978543195934, "grad_norm": 0.4512995183467865, "learning_rate": 2.1859473684210527e-05, "loss": 1.6642, "step": 159 }, { "epoch": 0.09034443817052512, "grad_norm": 0.9298767447471619, "learning_rate": 2.132631578947368e-05, "loss": 1.8871, "step": 160 }, { "epoch": 0.09090909090909091, "grad_norm": 0.39221644401550293, "learning_rate": 2.0793157894736842e-05, "loss": 1.6345, "step": 161 }, { "epoch": 0.09147374364765669, "grad_norm": 0.4242939054965973, "learning_rate": 2.026e-05, "loss": 1.5644, "step": 162 }, { "epoch": 0.09203839638622248, "grad_norm": 0.39541876316070557, "learning_rate": 1.9726842105263157e-05, "loss": 2.0449, "step": 163 }, { "epoch": 0.09260304912478826, "grad_norm": 0.49802130460739136, "learning_rate": 1.9193684210526314e-05, "loss": 1.7162, "step": 164 }, { "epoch": 0.09316770186335403, "grad_norm": 0.38496336340904236, "learning_rate": 1.866052631578947e-05, "loss": 2.2872, "step": 165 }, { "epoch": 0.09373235460191982, "grad_norm": 0.5159594416618347, "learning_rate": 1.8127368421052632e-05, "loss": 1.8203, "step": 166 }, { "epoch": 0.0942970073404856, "grad_norm": 0.7461097836494446, "learning_rate": 1.759421052631579e-05, "loss": 2.0484, "step": 167 }, { "epoch": 0.09486166007905138, "grad_norm": 0.4127659797668457, "learning_rate": 1.7061052631578947e-05, "loss": 1.8052, "step": 168 }, { "epoch": 0.09542631281761717, "grad_norm": 0.4347990155220032, "learning_rate": 1.6527894736842104e-05, "loss": 1.7951, "step": 169 }, { "epoch": 0.09599096555618294, "grad_norm": 0.4800684154033661, "learning_rate": 1.599473684210526e-05, "loss": 1.5261, "step": 170 }, { "epoch": 0.09655561829474874, "grad_norm": 0.3702009320259094, "learning_rate": 1.546157894736842e-05, "loss": 2.0826, "step": 171 }, { "epoch": 0.09712027103331451, "grad_norm": 0.6166195273399353, "learning_rate": 1.4928421052631576e-05, "loss": 1.437, "step": 172 }, { "epoch": 0.09768492377188029, "grad_norm": 0.5393872857093811, "learning_rate": 1.4395263157894735e-05, "loss": 2.0329, "step": 173 }, { "epoch": 0.09824957651044608, "grad_norm": 0.3947698771953583, "learning_rate": 1.3862105263157895e-05, "loss": 1.9888, "step": 174 }, { "epoch": 0.09881422924901186, "grad_norm": 0.5108495354652405, "learning_rate": 1.3328947368421052e-05, "loss": 1.6187, "step": 175 }, { "epoch": 0.09937888198757763, "grad_norm": 0.42177772521972656, "learning_rate": 1.2795789473684211e-05, "loss": 1.752, "step": 176 }, { "epoch": 0.09994353472614342, "grad_norm": 0.4926985800266266, "learning_rate": 1.2262631578947368e-05, "loss": 2.1683, "step": 177 }, { "epoch": 0.1005081874647092, "grad_norm": 0.46454355120658875, "learning_rate": 1.1729473684210526e-05, "loss": 1.4214, "step": 178 }, { "epoch": 0.10107284020327499, "grad_norm": 0.4612448811531067, "learning_rate": 1.1196315789473683e-05, "loss": 1.7107, "step": 179 }, { "epoch": 0.10163749294184077, "grad_norm": 0.5232647657394409, "learning_rate": 1.066315789473684e-05, "loss": 2.0586, "step": 180 }, { "epoch": 0.10220214568040654, "grad_norm": 0.5285339951515198, "learning_rate": 1.013e-05, "loss": 1.5791, "step": 181 }, { "epoch": 0.10276679841897234, "grad_norm": 0.46414557099342346, "learning_rate": 9.596842105263157e-06, "loss": 1.7391, "step": 182 }, { "epoch": 0.10333145115753811, "grad_norm": 0.5057702660560608, "learning_rate": 9.063684210526316e-06, "loss": 1.7127, "step": 183 }, { "epoch": 0.1038961038961039, "grad_norm": 0.5308021903038025, "learning_rate": 8.530526315789473e-06, "loss": 1.691, "step": 184 }, { "epoch": 0.10446075663466968, "grad_norm": 0.6663089394569397, "learning_rate": 7.99736842105263e-06, "loss": 1.403, "step": 185 }, { "epoch": 0.10502540937323546, "grad_norm": 0.601261556148529, "learning_rate": 7.464210526315788e-06, "loss": 1.5466, "step": 186 }, { "epoch": 0.10559006211180125, "grad_norm": 0.563347578048706, "learning_rate": 6.931052631578947e-06, "loss": 1.389, "step": 187 }, { "epoch": 0.10615471485036702, "grad_norm": 0.6577058434486389, "learning_rate": 6.3978947368421055e-06, "loss": 2.0848, "step": 188 }, { "epoch": 0.1067193675889328, "grad_norm": 1.0290292501449585, "learning_rate": 5.864736842105263e-06, "loss": 1.715, "step": 189 }, { "epoch": 0.10728402032749859, "grad_norm": 1.501333475112915, "learning_rate": 5.33157894736842e-06, "loss": 1.8911, "step": 190 }, { "epoch": 0.10784867306606437, "grad_norm": 0.7974535226821899, "learning_rate": 4.7984210526315785e-06, "loss": 1.6194, "step": 191 }, { "epoch": 0.10841332580463016, "grad_norm": 0.6893588304519653, "learning_rate": 4.265263157894737e-06, "loss": 2.0794, "step": 192 }, { "epoch": 0.10897797854319594, "grad_norm": 0.7431008815765381, "learning_rate": 3.732105263157894e-06, "loss": 1.2137, "step": 193 }, { "epoch": 0.10954263128176171, "grad_norm": 0.8880150318145752, "learning_rate": 3.1989473684210527e-06, "loss": 1.3514, "step": 194 }, { "epoch": 0.1101072840203275, "grad_norm": 1.2837520837783813, "learning_rate": 2.66578947368421e-06, "loss": 1.6779, "step": 195 }, { "epoch": 0.11067193675889328, "grad_norm": 1.7506012916564941, "learning_rate": 2.1326315789473684e-06, "loss": 0.9495, "step": 196 }, { "epoch": 0.11123658949745906, "grad_norm": 1.3716044425964355, "learning_rate": 1.5994736842105264e-06, "loss": 1.1218, "step": 197 }, { "epoch": 0.11180124223602485, "grad_norm": 1.528428077697754, "learning_rate": 1.0663157894736842e-06, "loss": 0.9545, "step": 198 }, { "epoch": 0.11236589497459062, "grad_norm": 1.5911756753921509, "learning_rate": 5.331578947368421e-07, "loss": 1.27, "step": 199 }, { "epoch": 0.11293054771315642, "grad_norm": 2.935549259185791, "learning_rate": 0.0, "loss": 2.7936, "step": 200 }, { "epoch": 0.11293054771315642, "eval_loss": 1.5823931694030762, "eval_runtime": 33.5466, "eval_samples_per_second": 22.238, "eval_steps_per_second": 5.574, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.675201553104896e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }