{ "best_metric": null, "best_model_checkpoint": null, "epoch": 100.0, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 2.4236936569213867, "learning_rate": 0.0001, "loss": 1.1597, "step": 1 }, { "epoch": 2.0, "grad_norm": 2.4236936569213867, "learning_rate": 0.0002, "loss": 1.1597, "step": 2 }, { "epoch": 3.0, "grad_norm": 1.3299813270568848, "learning_rate": 0.00019795918367346938, "loss": 0.9809, "step": 3 }, { "epoch": 4.0, "grad_norm": 1.62615966796875, "learning_rate": 0.0001959183673469388, "loss": 0.7416, "step": 4 }, { "epoch": 5.0, "grad_norm": 1.3189771175384521, "learning_rate": 0.00019387755102040816, "loss": 0.5219, "step": 5 }, { "epoch": 6.0, "grad_norm": 1.062267780303955, "learning_rate": 0.00019183673469387756, "loss": 0.3376, "step": 6 }, { "epoch": 7.0, "grad_norm": 0.7606938481330872, "learning_rate": 0.00018979591836734697, "loss": 0.1999, "step": 7 }, { "epoch": 8.0, "grad_norm": 0.6657623648643494, "learning_rate": 0.00018775510204081634, "loss": 0.1314, "step": 8 }, { "epoch": 9.0, "grad_norm": 0.4108690917491913, "learning_rate": 0.00018571428571428572, "loss": 0.087, "step": 9 }, { "epoch": 10.0, "grad_norm": 0.25315290689468384, "learning_rate": 0.00018367346938775512, "loss": 0.0777, "step": 10 }, { "epoch": 11.0, "grad_norm": 4.897407054901123, "learning_rate": 0.0001816326530612245, "loss": 0.087, "step": 11 }, { "epoch": 12.0, "grad_norm": 0.3001738488674164, "learning_rate": 0.0001795918367346939, "loss": 0.0707, "step": 12 }, { "epoch": 13.0, "grad_norm": 5.1508378982543945, "learning_rate": 0.00017755102040816327, "loss": 0.088, "step": 13 }, { "epoch": 14.0, "grad_norm": 9.67592716217041, "learning_rate": 0.00017551020408163265, "loss": 0.0799, "step": 14 }, { "epoch": 15.0, "grad_norm": 2.7876245975494385, "learning_rate": 0.00017346938775510205, "loss": 0.0967, "step": 15 }, { "epoch": 16.0, "grad_norm": 1.8201392889022827, "learning_rate": 0.00017142857142857143, "loss": 0.0601, "step": 16 }, { "epoch": 17.0, "grad_norm": 1.6338967084884644, "learning_rate": 0.00016938775510204083, "loss": 0.0512, "step": 17 }, { "epoch": 18.0, "grad_norm": 0.4770618975162506, "learning_rate": 0.00016734693877551023, "loss": 0.0373, "step": 18 }, { "epoch": 19.0, "grad_norm": 0.458904504776001, "learning_rate": 0.0001653061224489796, "loss": 0.0279, "step": 19 }, { "epoch": 20.0, "grad_norm": 1.6127618551254272, "learning_rate": 0.00016326530612244898, "loss": 0.023, "step": 20 }, { "epoch": 21.0, "grad_norm": 1.090635895729065, "learning_rate": 0.00016122448979591838, "loss": 0.0105, "step": 21 }, { "epoch": 22.0, "grad_norm": 0.08737627416849136, "learning_rate": 0.00015918367346938776, "loss": 0.0061, "step": 22 }, { "epoch": 23.0, "grad_norm": 0.08030153065919876, "learning_rate": 0.00015714285714285716, "loss": 0.0051, "step": 23 }, { "epoch": 24.0, "grad_norm": 0.04834992438554764, "learning_rate": 0.00015510204081632654, "loss": 0.0048, "step": 24 }, { "epoch": 25.0, "grad_norm": 0.09552427381277084, "learning_rate": 0.0001530612244897959, "loss": 0.0048, "step": 25 }, { "epoch": 26.0, "grad_norm": 0.0460357591509819, "learning_rate": 0.0001510204081632653, "loss": 0.0046, "step": 26 }, { "epoch": 27.0, "grad_norm": 0.0992983803153038, "learning_rate": 0.00014897959183673472, "loss": 0.0047, "step": 27 }, { "epoch": 28.0, "grad_norm": 0.1185307577252388, "learning_rate": 0.0001469387755102041, "loss": 0.0048, "step": 28 }, { "epoch": 
29.0, "grad_norm": 0.14356577396392822, "learning_rate": 0.0001448979591836735, "loss": 0.0049, "step": 29 }, { "epoch": 30.0, "grad_norm": 0.0876593366265297, "learning_rate": 0.00014285714285714287, "loss": 0.0046, "step": 30 }, { "epoch": 31.0, "grad_norm": 0.08368655294179916, "learning_rate": 0.00014081632653061224, "loss": 0.0047, "step": 31 }, { "epoch": 32.0, "grad_norm": 0.042614325881004333, "learning_rate": 0.00013877551020408165, "loss": 0.0045, "step": 32 }, { "epoch": 33.0, "grad_norm": 0.00863147247582674, "learning_rate": 0.00013673469387755102, "loss": 0.0044, "step": 33 }, { "epoch": 34.0, "grad_norm": 0.0089035639539361, "learning_rate": 0.0001346938775510204, "loss": 0.0044, "step": 34 }, { "epoch": 35.0, "grad_norm": 0.03151445463299751, "learning_rate": 0.0001326530612244898, "loss": 0.0044, "step": 35 }, { "epoch": 36.0, "grad_norm": 0.0310219656676054, "learning_rate": 0.00013061224489795917, "loss": 0.0044, "step": 36 }, { "epoch": 37.0, "grad_norm": 0.030537990853190422, "learning_rate": 0.00012857142857142858, "loss": 0.0044, "step": 37 }, { "epoch": 38.0, "grad_norm": 0.03915323689579964, "learning_rate": 0.00012653061224489798, "loss": 0.0044, "step": 38 }, { "epoch": 39.0, "grad_norm": 0.038823168724775314, "learning_rate": 0.00012448979591836735, "loss": 0.0044, "step": 39 }, { "epoch": 40.0, "grad_norm": 0.005813094321638346, "learning_rate": 0.00012244897959183676, "loss": 0.0044, "step": 40 }, { "epoch": 41.0, "grad_norm": 0.02931728959083557, "learning_rate": 0.00012040816326530613, "loss": 0.0044, "step": 41 }, { "epoch": 42.0, "grad_norm": 0.02919776178896427, "learning_rate": 0.00011836734693877552, "loss": 0.0044, "step": 42 }, { "epoch": 43.0, "grad_norm": 0.005404920782893896, "learning_rate": 0.0001163265306122449, "loss": 0.0044, "step": 43 }, { "epoch": 44.0, "grad_norm": 0.03844524547457695, "learning_rate": 0.00011428571428571428, "loss": 0.0044, "step": 44 }, { "epoch": 45.0, "grad_norm": 0.005866413004696369, "learning_rate": 0.00011224489795918367, "loss": 0.0043, "step": 45 }, { "epoch": 46.0, "grad_norm": 0.028769973665475845, "learning_rate": 0.00011020408163265306, "loss": 0.0044, "step": 46 }, { "epoch": 47.0, "grad_norm": 0.028779389336705208, "learning_rate": 0.00010816326530612246, "loss": 0.0044, "step": 47 }, { "epoch": 48.0, "grad_norm": 0.006406720262020826, "learning_rate": 0.00010612244897959185, "loss": 0.0043, "step": 48 }, { "epoch": 49.0, "grad_norm": 0.03870103508234024, "learning_rate": 0.00010408163265306123, "loss": 0.0044, "step": 49 }, { "epoch": 50.0, "grad_norm": 0.038732465356588364, "learning_rate": 0.00010204081632653062, "loss": 0.0044, "step": 50 }, { "epoch": 51.0, "grad_norm": 0.00536749605089426, "learning_rate": 0.0001, "loss": 0.0043, "step": 51 }, { "epoch": 52.0, "grad_norm": 0.028795704245567322, "learning_rate": 9.79591836734694e-05, "loss": 0.0044, "step": 52 }, { "epoch": 53.0, "grad_norm": 0.02882838249206543, "learning_rate": 9.591836734693878e-05, "loss": 0.0044, "step": 53 }, { "epoch": 54.0, "grad_norm": 0.02882484346628189, "learning_rate": 9.387755102040817e-05, "loss": 0.0044, "step": 54 }, { "epoch": 55.0, "grad_norm": 0.00502988463267684, "learning_rate": 9.183673469387756e-05, "loss": 0.0043, "step": 55 }, { "epoch": 56.0, "grad_norm": 0.005017813295125961, "learning_rate": 8.979591836734695e-05, "loss": 0.0043, "step": 56 }, { "epoch": 57.0, "grad_norm": 0.005218982230871916, "learning_rate": 8.775510204081632e-05, "loss": 0.0043, "step": 57 }, { "epoch": 58.0, "grad_norm": 
0.038150474429130554, "learning_rate": 8.571428571428571e-05, "loss": 0.0044, "step": 58 }, { "epoch": 59.0, "grad_norm": 0.03801121190190315, "learning_rate": 8.367346938775511e-05, "loss": 0.0044, "step": 59 }, { "epoch": 60.0, "grad_norm": 0.004977047443389893, "learning_rate": 8.163265306122449e-05, "loss": 0.0043, "step": 60 }, { "epoch": 61.0, "grad_norm": 0.0602005235850811, "learning_rate": 7.959183673469388e-05, "loss": 0.0045, "step": 61 }, { "epoch": 62.0, "grad_norm": 0.0284713264554739, "learning_rate": 7.755102040816327e-05, "loss": 0.0044, "step": 62 }, { "epoch": 63.0, "grad_norm": 0.004900462459772825, "learning_rate": 7.551020408163266e-05, "loss": 0.0043, "step": 63 }, { "epoch": 64.0, "grad_norm": 0.0048695337027311325, "learning_rate": 7.346938775510205e-05, "loss": 0.0043, "step": 64 }, { "epoch": 65.0, "grad_norm": 0.004841310903429985, "learning_rate": 7.142857142857143e-05, "loss": 0.0043, "step": 65 }, { "epoch": 66.0, "grad_norm": 0.004835947882384062, "learning_rate": 6.938775510204082e-05, "loss": 0.0043, "step": 66 }, { "epoch": 67.0, "grad_norm": 0.03715803846716881, "learning_rate": 6.73469387755102e-05, "loss": 0.0044, "step": 67 }, { "epoch": 68.0, "grad_norm": 0.004820631351321936, "learning_rate": 6.530612244897959e-05, "loss": 0.0043, "step": 68 }, { "epoch": 69.0, "grad_norm": 0.004830862861126661, "learning_rate": 6.326530612244899e-05, "loss": 0.0043, "step": 69 }, { "epoch": 70.0, "grad_norm": 0.02773592434823513, "learning_rate": 6.122448979591838e-05, "loss": 0.0044, "step": 70 }, { "epoch": 71.0, "grad_norm": 0.02766617201268673, "learning_rate": 5.918367346938776e-05, "loss": 0.0044, "step": 71 }, { "epoch": 72.0, "grad_norm": 0.027592068538069725, "learning_rate": 5.714285714285714e-05, "loss": 0.0044, "step": 72 }, { "epoch": 73.0, "grad_norm": 0.004762730095535517, "learning_rate": 5.510204081632653e-05, "loss": 0.0043, "step": 73 }, { "epoch": 74.0, "grad_norm": 0.004771250765770674, "learning_rate": 5.3061224489795926e-05, "loss": 0.0043, "step": 74 }, { "epoch": 75.0, "grad_norm": 0.004748388193547726, "learning_rate": 5.102040816326531e-05, "loss": 0.0043, "step": 75 }, { "epoch": 76.0, "grad_norm": 0.03664989769458771, "learning_rate": 4.89795918367347e-05, "loss": 0.0044, "step": 76 }, { "epoch": 77.0, "grad_norm": 0.004758178256452084, "learning_rate": 4.6938775510204086e-05, "loss": 0.0043, "step": 77 }, { "epoch": 78.0, "grad_norm": 0.03650437667965889, "learning_rate": 4.4897959183673474e-05, "loss": 0.0044, "step": 78 }, { "epoch": 79.0, "grad_norm": 0.004723742604255676, "learning_rate": 4.2857142857142856e-05, "loss": 0.0043, "step": 79 }, { "epoch": 80.0, "grad_norm": 0.027179496362805367, "learning_rate": 4.0816326530612245e-05, "loss": 0.0044, "step": 80 }, { "epoch": 81.0, "grad_norm": 0.027180835604667664, "learning_rate": 3.8775510204081634e-05, "loss": 0.0044, "step": 81 }, { "epoch": 82.0, "grad_norm": 0.027134299278259277, "learning_rate": 3.673469387755102e-05, "loss": 0.0044, "step": 82 }, { "epoch": 83.0, "grad_norm": 0.027070023119449615, "learning_rate": 3.469387755102041e-05, "loss": 0.0044, "step": 83 }, { "epoch": 84.0, "grad_norm": 0.004842973779886961, "learning_rate": 3.265306122448979e-05, "loss": 0.0043, "step": 84 }, { "epoch": 85.0, "grad_norm": 0.004742528777569532, "learning_rate": 3.061224489795919e-05, "loss": 0.0043, "step": 85 }, { "epoch": 86.0, "grad_norm": 0.004724172409623861, "learning_rate": 2.857142857142857e-05, "loss": 0.0043, "step": 86 }, { "epoch": 87.0, "grad_norm": 0.03628339245915413, 
"learning_rate": 2.6530612244897963e-05, "loss": 0.0044, "step": 87 }, { "epoch": 88.0, "grad_norm": 0.036234479397535324, "learning_rate": 2.448979591836735e-05, "loss": 0.0044, "step": 88 }, { "epoch": 89.0, "grad_norm": 0.004697797354310751, "learning_rate": 2.2448979591836737e-05, "loss": 0.0043, "step": 89 }, { "epoch": 90.0, "grad_norm": 0.004719586111605167, "learning_rate": 2.0408163265306123e-05, "loss": 0.0043, "step": 90 }, { "epoch": 91.0, "grad_norm": 0.004701649770140648, "learning_rate": 1.836734693877551e-05, "loss": 0.0043, "step": 91 }, { "epoch": 92.0, "grad_norm": 0.026852702721953392, "learning_rate": 1.6326530612244897e-05, "loss": 0.0044, "step": 92 }, { "epoch": 93.0, "grad_norm": 0.02685590460896492, "learning_rate": 1.4285714285714285e-05, "loss": 0.0044, "step": 93 }, { "epoch": 94.0, "grad_norm": 0.0047478387132287025, "learning_rate": 1.2244897959183674e-05, "loss": 0.0043, "step": 94 }, { "epoch": 95.0, "grad_norm": 0.026850329712033272, "learning_rate": 1.0204081632653061e-05, "loss": 0.0044, "step": 95 }, { "epoch": 96.0, "grad_norm": 0.00475778803229332, "learning_rate": 8.163265306122448e-06, "loss": 0.0043, "step": 96 }, { "epoch": 97.0, "grad_norm": 0.004756512586027384, "learning_rate": 6.122448979591837e-06, "loss": 0.0043, "step": 97 }, { "epoch": 98.0, "grad_norm": 0.004749584477394819, "learning_rate": 4.081632653061224e-06, "loss": 0.0043, "step": 98 }, { "epoch": 99.0, "grad_norm": 0.004759279545396566, "learning_rate": 2.040816326530612e-06, "loss": 0.0043, "step": 99 }, { "epoch": 100.0, "grad_norm": 0.004726547747850418, "learning_rate": 0.0, "loss": 0.0043, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "total_flos": 760517336064000.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }