{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 137, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.9848, "step": 1 }, { "epoch": 0.01, "grad_norm": 1.953125, "learning_rate": 4e-05, "loss": 0.9509, "step": 2 }, { "epoch": 0.02, "grad_norm": 2.53125, "learning_rate": 6e-05, "loss": 1.0676, "step": 3 }, { "epoch": 0.03, "grad_norm": 1.953125, "learning_rate": 8e-05, "loss": 0.823, "step": 4 }, { "epoch": 0.04, "grad_norm": 2.515625, "learning_rate": 0.0001, "loss": 0.6513, "step": 5 }, { "epoch": 0.04, "grad_norm": 1.2421875, "learning_rate": 0.00012, "loss": 0.2973, "step": 6 }, { "epoch": 0.05, "grad_norm": 0.51171875, "learning_rate": 0.00014, "loss": 0.1631, "step": 7 }, { "epoch": 0.06, "grad_norm": 0.396484375, "learning_rate": 0.00016, "loss": 0.1212, "step": 8 }, { "epoch": 0.07, "grad_norm": 2.390625, "learning_rate": 0.00018, "loss": 0.2939, "step": 9 }, { "epoch": 0.07, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.2165, "step": 10 }, { "epoch": 0.08, "grad_norm": 0.8671875, "learning_rate": 0.0001999694057253083, "loss": 0.2527, "step": 11 }, { "epoch": 0.09, "grad_norm": 0.1923828125, "learning_rate": 0.00019987764162142613, "loss": 0.1284, "step": 12 }, { "epoch": 0.09, "grad_norm": 0.255859375, "learning_rate": 0.00019972476383747748, "loss": 0.1244, "step": 13 }, { "epoch": 0.1, "grad_norm": 0.09521484375, "learning_rate": 0.0001995108659171607, "loss": 0.1175, "step": 14 }, { "epoch": 0.11, "grad_norm": 0.27734375, "learning_rate": 0.00019923607874151032, "loss": 0.1832, "step": 15 }, { "epoch": 0.12, "grad_norm": 0.09228515625, "learning_rate": 0.00019890057044881306, "loss": 0.1227, "step": 16 }, { "epoch": 0.12, "grad_norm": 0.115234375, "learning_rate": 0.00019850454633172631, "loss": 0.1211, "step": 17 }, { "epoch": 0.13, "grad_norm": 0.064453125, "learning_rate": 0.00019804824871166255, "loss": 0.1122, "step": 18 }, { "epoch": 0.14, "grad_norm": 0.10400390625, "learning_rate": 0.00019753195679051628, "loss": 0.1077, "step": 19 }, { "epoch": 0.15, "grad_norm": 0.051025390625, "learning_rate": 0.00019695598647982468, "loss": 0.104, "step": 20 }, { "epoch": 0.15, "grad_norm": 0.09326171875, "learning_rate": 0.00019632069020746572, "loss": 0.1143, "step": 21 }, { "epoch": 0.16, "grad_norm": 0.125, "learning_rate": 0.00019562645670201276, "loss": 0.1172, "step": 22 }, { "epoch": 0.17, "grad_norm": 0.130859375, "learning_rate": 0.00019487371075487713, "loss": 0.1083, "step": 23 }, { "epoch": 0.18, "grad_norm": 0.08349609375, "learning_rate": 0.0001940629129603844, "loss": 0.1051, "step": 24 }, { "epoch": 0.18, "grad_norm": 0.146484375, "learning_rate": 0.00019319455943394347, "loss": 0.1076, "step": 25 }, { "epoch": 0.19, "grad_norm": 0.09423828125, "learning_rate": 0.00019226918150848068, "loss": 0.0868, "step": 26 }, { "epoch": 0.2, "grad_norm": 0.275390625, "learning_rate": 0.00019128734540932495, "loss": 0.1223, "step": 27 }, { "epoch": 0.2, "grad_norm": 0.318359375, "learning_rate": 0.00019024965190774263, "loss": 0.1346, "step": 28 }, { "epoch": 0.21, "grad_norm": 0.46484375, "learning_rate": 0.00018915673595333444, "loss": 0.1361, "step": 29 }, { "epoch": 0.22, "grad_norm": 0.10498046875, "learning_rate": 0.00018800926628551886, "loss": 0.1178, "step": 30 }, { "epoch": 0.23, "grad_norm": 0.1044921875, "learning_rate": 0.00018680794502434018, "loss": 0.1085, "step": 
31 }, { "epoch": 0.23, "grad_norm": 0.0634765625, "learning_rate": 0.00018555350724085162, "loss": 0.0985, "step": 32 }, { "epoch": 0.24, "grad_norm": 0.2119140625, "learning_rate": 0.00018424672050733576, "loss": 0.1486, "step": 33 }, { "epoch": 0.25, "grad_norm": 0.158203125, "learning_rate": 0.00018288838442763838, "loss": 0.1285, "step": 34 }, { "epoch": 0.26, "grad_norm": 0.099609375, "learning_rate": 0.00018147933014790244, "loss": 0.1147, "step": 35 }, { "epoch": 0.26, "grad_norm": 0.08056640625, "learning_rate": 0.00018002041984800174, "loss": 0.0984, "step": 36 }, { "epoch": 0.27, "grad_norm": 0.07763671875, "learning_rate": 0.0001785125462139855, "loss": 0.1147, "step": 37 }, { "epoch": 0.28, "grad_norm": 0.062255859375, "learning_rate": 0.000176956631891857, "loss": 0.1016, "step": 38 }, { "epoch": 0.28, "grad_norm": 0.053466796875, "learning_rate": 0.00017535362892301954, "loss": 0.1013, "step": 39 }, { "epoch": 0.29, "grad_norm": 0.0673828125, "learning_rate": 0.0001737045181617364, "loss": 0.1001, "step": 40 }, { "epoch": 0.3, "grad_norm": 0.0908203125, "learning_rate": 0.00017201030867496005, "loss": 0.12, "step": 41 }, { "epoch": 0.31, "grad_norm": 0.12060546875, "learning_rate": 0.000170272037124899, "loss": 0.1234, "step": 42 }, { "epoch": 0.31, "grad_norm": 0.0703125, "learning_rate": 0.00016849076713469914, "loss": 0.1168, "step": 43 }, { "epoch": 0.32, "grad_norm": 0.07177734375, "learning_rate": 0.00016666758863762793, "loss": 0.094, "step": 44 }, { "epoch": 0.33, "grad_norm": 0.038818359375, "learning_rate": 0.00016480361721016054, "loss": 0.1026, "step": 45 }, { "epoch": 0.34, "grad_norm": 0.080078125, "learning_rate": 0.00016289999338937427, "loss": 0.0922, "step": 46 }, { "epoch": 0.34, "grad_norm": 0.072265625, "learning_rate": 0.00016095788197507081, "loss": 0.1234, "step": 47 }, { "epoch": 0.35, "grad_norm": 0.07080078125, "learning_rate": 0.00015897847131705195, "loss": 0.1014, "step": 48 }, { "epoch": 0.36, "grad_norm": 0.10205078125, "learning_rate": 0.0001569629725879857, "loss": 0.0768, "step": 49 }, { "epoch": 0.36, "grad_norm": 0.166015625, "learning_rate": 0.00015491261904230727, "loss": 0.0872, "step": 50 }, { "epoch": 0.37, "grad_norm": 0.1796875, "learning_rate": 0.00015282866526160837, "loss": 0.1247, "step": 51 }, { "epoch": 0.38, "grad_norm": 0.07958984375, "learning_rate": 0.00015071238638697732, "loss": 0.071, "step": 52 }, { "epoch": 0.39, "grad_norm": 0.1943359375, "learning_rate": 0.00014856507733875836, "loss": 0.1064, "step": 53 }, { "epoch": 0.39, "grad_norm": 0.23046875, "learning_rate": 0.00014638805202420895, "loss": 0.0965, "step": 54 }, { "epoch": 0.4, "grad_norm": 0.17578125, "learning_rate": 0.0001441826425335387, "loss": 0.0919, "step": 55 }, { "epoch": 0.41, "grad_norm": 0.1611328125, "learning_rate": 0.0001419501983248229, "loss": 0.0636, "step": 56 }, { "epoch": 0.42, "grad_norm": 0.400390625, "learning_rate": 0.00013969208539828872, "loss": 0.0864, "step": 57 }, { "epoch": 0.42, "grad_norm": 0.130859375, "learning_rate": 0.00013740968546047935, "loss": 0.0747, "step": 58 }, { "epoch": 0.43, "grad_norm": 0.373046875, "learning_rate": 0.00013510439507880776, "loss": 0.1053, "step": 59 }, { "epoch": 0.44, "grad_norm": 0.283203125, "learning_rate": 0.00013277762482701767, "loss": 0.0828, "step": 60 }, { "epoch": 0.45, "grad_norm": 0.318359375, "learning_rate": 0.0001304307984220736, "loss": 0.0796, "step": 61 }, { "epoch": 0.45, "grad_norm": 0.328125, "learning_rate": 0.0001280653518530093, "loss": 0.0879, "step": 62 }, { "epoch": 
0.46, "grad_norm": 0.1474609375, "learning_rate": 0.0001256827325022668, "loss": 0.1077, "step": 63 }, { "epoch": 0.47, "grad_norm": 0.2734375, "learning_rate": 0.00012328439826006415, "loss": 0.0839, "step": 64 }, { "epoch": 0.47, "grad_norm": 0.134765625, "learning_rate": 0.00012087181663233354, "loss": 0.0732, "step": 65 }, { "epoch": 0.48, "grad_norm": 0.322265625, "learning_rate": 0.0001184464638427756, "loss": 0.0832, "step": 66 }, { "epoch": 0.49, "grad_norm": 0.6484375, "learning_rate": 0.00011600982392957978, "loss": 0.0903, "step": 67 }, { "epoch": 0.5, "grad_norm": 0.36328125, "learning_rate": 0.00011356338783736255, "loss": 0.0936, "step": 68 }, { "epoch": 0.5, "grad_norm": 0.0732421875, "learning_rate": 0.00011110865250488047, "loss": 0.0622, "step": 69 }, { "epoch": 0.51, "grad_norm": 0.28125, "learning_rate": 0.00010864711994907458, "loss": 0.0873, "step": 70 }, { "epoch": 0.52, "grad_norm": 0.1494140625, "learning_rate": 0.00010618029634600843, "loss": 0.1168, "step": 71 }, { "epoch": 0.53, "grad_norm": 0.185546875, "learning_rate": 0.00010370969110926052, "loss": 0.0881, "step": 72 }, { "epoch": 0.53, "grad_norm": 0.27734375, "learning_rate": 0.00010123681596633629, "loss": 0.1136, "step": 73 }, { "epoch": 0.54, "grad_norm": 0.1884765625, "learning_rate": 9.876318403366372e-05, "loss": 0.0861, "step": 74 }, { "epoch": 0.55, "grad_norm": 0.080078125, "learning_rate": 9.629030889073949e-05, "loss": 0.0819, "step": 75 }, { "epoch": 0.55, "grad_norm": 0.1318359375, "learning_rate": 9.38197036539916e-05, "loss": 0.0784, "step": 76 }, { "epoch": 0.56, "grad_norm": 0.240234375, "learning_rate": 9.135288005092546e-05, "loss": 0.0631, "step": 77 }, { "epoch": 0.57, "grad_norm": 0.1826171875, "learning_rate": 8.889134749511955e-05, "loss": 0.1073, "step": 78 }, { "epoch": 0.58, "grad_norm": 0.1953125, "learning_rate": 8.643661216263743e-05, "loss": 0.0909, "step": 79 }, { "epoch": 0.58, "grad_norm": 0.298828125, "learning_rate": 8.399017607042025e-05, "loss": 0.1, "step": 80 }, { "epoch": 0.59, "grad_norm": 0.0703125, "learning_rate": 8.155353615722442e-05, "loss": 0.0581, "step": 81 }, { "epoch": 0.6, "grad_norm": 0.06689453125, "learning_rate": 7.91281833676665e-05, "loss": 0.0695, "step": 82 }, { "epoch": 0.61, "grad_norm": 0.1572265625, "learning_rate": 7.671560173993587e-05, "loss": 0.1054, "step": 83 }, { "epoch": 0.61, "grad_norm": 0.052734375, "learning_rate": 7.431726749773322e-05, "loss": 0.0946, "step": 84 }, { "epoch": 0.62, "grad_norm": 0.08154296875, "learning_rate": 7.193464814699073e-05, "loss": 0.0561, "step": 85 }, { "epoch": 0.63, "grad_norm": 0.181640625, "learning_rate": 6.956920157792639e-05, "loss": 0.0902, "step": 86 }, { "epoch": 0.64, "grad_norm": 0.0908203125, "learning_rate": 6.722237517298232e-05, "loss": 0.0694, "step": 87 }, { "epoch": 0.64, "grad_norm": 0.11767578125, "learning_rate": 6.489560492119225e-05, "loss": 0.1068, "step": 88 }, { "epoch": 0.65, "grad_norm": 0.125, "learning_rate": 6.259031453952069e-05, "loss": 0.0671, "step": 89 }, { "epoch": 0.66, "grad_norm": 0.0732421875, "learning_rate": 6.0307914601711305e-05, "loss": 0.0891, "step": 90 }, { "epoch": 0.66, "grad_norm": 0.08056640625, "learning_rate": 5.8049801675177115e-05, "loss": 0.0782, "step": 91 }, { "epoch": 0.67, "grad_norm": 0.2060546875, "learning_rate": 5.5817357466461336e-05, "loss": 0.1028, "step": 92 }, { "epoch": 0.68, "grad_norm": 0.125, "learning_rate": 5.361194797579108e-05, "loss": 0.0581, "step": 93 }, { "epoch": 0.69, "grad_norm": 0.08935546875, "learning_rate": 
5.1434922661241635e-05, "loss": 0.0619, "step": 94 }, { "epoch": 0.69, "grad_norm": 0.07373046875, "learning_rate": 4.928761361302269e-05, "loss": 0.0529, "step": 95 }, { "epoch": 0.7, "grad_norm": 0.0703125, "learning_rate": 4.717133473839163e-05, "loss": 0.0397, "step": 96 }, { "epoch": 0.71, "grad_norm": 0.11767578125, "learning_rate": 4.5087380957692784e-05, "loss": 0.0459, "step": 97 }, { "epoch": 0.72, "grad_norm": 0.203125, "learning_rate": 4.303702741201431e-05, "loss": 0.0752, "step": 98 }, { "epoch": 0.72, "grad_norm": 0.15625, "learning_rate": 4.1021528682948066e-05, "loss": 0.0988, "step": 99 }, { "epoch": 0.73, "grad_norm": 0.09814453125, "learning_rate": 3.904211802492922e-05, "loss": 0.0739, "step": 100 }, { "epoch": 0.74, "grad_norm": 0.1298828125, "learning_rate": 3.7100006610625784e-05, "loss": 0.0634, "step": 101 }, { "epoch": 0.74, "grad_norm": 0.1181640625, "learning_rate": 3.519638278983948e-05, "loss": 0.057, "step": 102 }, { "epoch": 0.75, "grad_norm": 0.1123046875, "learning_rate": 3.333241136237206e-05, "loss": 0.0556, "step": 103 }, { "epoch": 0.76, "grad_norm": 0.32421875, "learning_rate": 3.150923286530089e-05, "loss": 0.1233, "step": 104 }, { "epoch": 0.77, "grad_norm": 0.193359375, "learning_rate": 2.9727962875101e-05, "loss": 0.1074, "step": 105 }, { "epoch": 0.77, "grad_norm": 0.1484375, "learning_rate": 2.798969132503997e-05, "loss": 0.0993, "step": 106 }, { "epoch": 0.78, "grad_norm": 0.0771484375, "learning_rate": 2.6295481838263626e-05, "loss": 0.0537, "step": 107 }, { "epoch": 0.79, "grad_norm": 0.0869140625, "learning_rate": 2.4646371076980457e-05, "loss": 0.0704, "step": 108 }, { "epoch": 0.8, "grad_norm": 0.10205078125, "learning_rate": 2.3043368108143047e-05, "loss": 0.0825, "step": 109 }, { "epoch": 0.8, "grad_norm": 0.09912109375, "learning_rate": 2.1487453786014512e-05, "loss": 0.0422, "step": 110 }, { "epoch": 0.81, "grad_norm": 0.1123046875, "learning_rate": 1.997958015199829e-05, "loss": 0.0726, "step": 111 }, { "epoch": 0.82, "grad_norm": 0.1982421875, "learning_rate": 1.8520669852097573e-05, "loss": 0.0518, "step": 112 }, { "epoch": 0.82, "grad_norm": 0.091796875, "learning_rate": 1.7111615572361628e-05, "loss": 0.0884, "step": 113 }, { "epoch": 0.83, "grad_norm": 0.1181640625, "learning_rate": 1.5753279492664262e-05, "loss": 0.0892, "step": 114 }, { "epoch": 0.84, "grad_norm": 0.07080078125, "learning_rate": 1.4446492759148411e-05, "loss": 0.0478, "step": 115 }, { "epoch": 0.85, "grad_norm": 0.087890625, "learning_rate": 1.319205497565983e-05, "loss": 0.0786, "step": 116 }, { "epoch": 0.85, "grad_norm": 0.134765625, "learning_rate": 1.1990733714481184e-05, "loss": 0.0429, "step": 117 }, { "epoch": 0.86, "grad_norm": 0.083984375, "learning_rate": 1.0843264046665557e-05, "loss": 0.0455, "step": 118 }, { "epoch": 0.87, "grad_norm": 0.1982421875, "learning_rate": 9.750348092257367e-06, "loss": 0.0896, "step": 119 }, { "epoch": 0.88, "grad_norm": 0.09716796875, "learning_rate": 8.712654590675085e-06, "loss": 0.0727, "step": 120 }, { "epoch": 0.88, "grad_norm": 0.0751953125, "learning_rate": 7.730818491519343e-06, "loss": 0.0284, "step": 121 }, { "epoch": 0.89, "grad_norm": 0.11181640625, "learning_rate": 6.805440566056553e-06, "loss": 0.0484, "step": 122 }, { "epoch": 0.9, "grad_norm": 0.130859375, "learning_rate": 5.937087039615619e-06, "loss": 0.0573, "step": 123 }, { "epoch": 0.91, "grad_norm": 0.1142578125, "learning_rate": 5.126289245122906e-06, "loss": 0.071, "step": 124 }, { "epoch": 0.91, "grad_norm": 0.09375, "learning_rate": 
4.37354329798726e-06, "loss": 0.0695, "step": 125 }, { "epoch": 0.92, "grad_norm": 0.09375, "learning_rate": 3.679309792534291e-06, "loss": 0.0453, "step": 126 }, { "epoch": 0.93, "grad_norm": 0.1298828125, "learning_rate": 3.0440135201753374e-06, "loss": 0.0906, "step": 127 }, { "epoch": 0.93, "grad_norm": 0.09912109375, "learning_rate": 2.468043209483739e-06, "loss": 0.0617, "step": 128 }, { "epoch": 0.94, "grad_norm": 0.142578125, "learning_rate": 1.951751288337467e-06, "loss": 0.0667, "step": 129 }, { "epoch": 0.95, "grad_norm": 0.13671875, "learning_rate": 1.4954536682736719e-06, "loss": 0.0686, "step": 130 }, { "epoch": 0.96, "grad_norm": 0.103515625, "learning_rate": 1.0994295511869257e-06, "loss": 0.0415, "step": 131 }, { "epoch": 0.96, "grad_norm": 0.12255859375, "learning_rate": 7.639212584897082e-07, "loss": 0.0573, "step": 132 }, { "epoch": 0.97, "grad_norm": 0.1689453125, "learning_rate": 4.891340828393487e-07, "loss": 0.0876, "step": 133 }, { "epoch": 0.98, "grad_norm": 0.0712890625, "learning_rate": 2.752361625225297e-07, "loss": 0.0599, "step": 134 }, { "epoch": 0.99, "grad_norm": 0.072265625, "learning_rate": 1.2235837857387246e-07, "loss": 0.0593, "step": 135 }, { "epoch": 0.99, "grad_norm": 0.123046875, "learning_rate": 3.059427469168652e-08, "loss": 0.101, "step": 136 }, { "epoch": 1.0, "grad_norm": 0.146484375, "learning_rate": 0.0, "loss": 0.0724, "step": 137 }, { "epoch": 1.0, "eval_loss": 0.05070766434073448, "eval_runtime": 6.5801, "eval_samples_per_second": 17.477, "eval_steps_per_second": 1.216, "step": 137 } ], "logging_steps": 1, "max_steps": 137, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 9.602610483743949e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }