{ "best_metric": null, "best_model_checkpoint": null, "epoch": 100.0, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 3.3439955711364746, "learning_rate": 0.0001, "loss": 5.4154, "step": 1 }, { "epoch": 2.0, "grad_norm": 3.3439955711364746, "learning_rate": 0.0002, "loss": 5.4154, "step": 2 }, { "epoch": 3.0, "grad_norm": 4.64585542678833, "learning_rate": 0.00019795918367346938, "loss": 5.1115, "step": 3 }, { "epoch": 4.0, "grad_norm": 7.641846656799316, "learning_rate": 0.0001959183673469388, "loss": 4.5815, "step": 4 }, { "epoch": 5.0, "grad_norm": 10.534232139587402, "learning_rate": 0.00019387755102040816, "loss": 4.1388, "step": 5 }, { "epoch": 6.0, "grad_norm": null, "learning_rate": 0.00019387755102040816, "loss": 3.7196, "step": 6 }, { "epoch": 7.0, "grad_norm": 13.133545875549316, "learning_rate": 0.00019183673469387756, "loss": 3.7196, "step": 7 }, { "epoch": 8.0, "grad_norm": 15.314064025878906, "learning_rate": 0.00018979591836734697, "loss": 3.2758, "step": 8 }, { "epoch": 9.0, "grad_norm": 16.992698669433594, "learning_rate": 0.00018775510204081634, "loss": 2.8057, "step": 9 }, { "epoch": 10.0, "grad_norm": 18.105308532714844, "learning_rate": 0.00018571428571428572, "loss": 2.3037, "step": 10 }, { "epoch": 11.0, "grad_norm": 18.624324798583984, "learning_rate": 0.00018367346938775512, "loss": 1.7698, "step": 11 }, { "epoch": 12.0, "grad_norm": 18.554916381835938, "learning_rate": 0.0001816326530612245, "loss": 1.2219, "step": 12 }, { "epoch": 13.0, "grad_norm": 17.93878746032715, "learning_rate": 0.0001795918367346939, "loss": 0.6767, "step": 13 }, { "epoch": 14.0, "grad_norm": 2.917646884918213, "learning_rate": 0.00017755102040816327, "loss": 0.1873, "step": 14 }, { "epoch": 15.0, "grad_norm": 3.2019948959350586, "learning_rate": 0.00017551020408163265, "loss": 0.1488, "step": 15 }, { "epoch": 16.0, "grad_norm": 
3.1802170276641846, "learning_rate": 0.00017346938775510205, "loss": 0.1134, "step": 16 }, { "epoch": 17.0, "grad_norm": 1.5858300924301147, "learning_rate": 0.00017142857142857143, "loss": 0.0832, "step": 17 }, { "epoch": 18.0, "grad_norm": 0.5080211758613586, "learning_rate": 0.00016938775510204083, "loss": 0.0701, "step": 18 }, { "epoch": 19.0, "grad_norm": 0.5199990272521973, "learning_rate": 0.00016734693877551023, "loss": 0.0623, "step": 19 }, { "epoch": 20.0, "grad_norm": 0.5190378427505493, "learning_rate": 0.0001653061224489796, "loss": 0.0552, "step": 20 }, { "epoch": 21.0, "grad_norm": 0.5189256072044373, "learning_rate": 0.00016326530612244898, "loss": 0.0456, "step": 21 }, { "epoch": 22.0, "grad_norm": 0.5494657754898071, "learning_rate": 0.00016122448979591838, "loss": 0.0356, "step": 22 }, { "epoch": 23.0, "grad_norm": 0.5577398538589478, "learning_rate": 0.00015918367346938776, "loss": 0.024, "step": 23 }, { "epoch": 24.0, "grad_norm": 0.39652666449546814, "learning_rate": 0.00015714285714285716, "loss": 0.0128, "step": 24 }, { "epoch": 25.0, "grad_norm": 0.12145466357469559, "learning_rate": 0.00015510204081632654, "loss": 0.0064, "step": 25 }, { "epoch": 26.0, "grad_norm": 0.16227224469184875, "learning_rate": 0.0001530612244897959, "loss": 0.0058, "step": 26 }, { "epoch": 27.0, "grad_norm": 0.010005966760218143, "learning_rate": 0.0001510204081632653, "loss": 0.0046, "step": 27 }, { "epoch": 28.0, "grad_norm": 0.11974961310625076, "learning_rate": 0.00014897959183673472, "loss": 0.0053, "step": 28 }, { "epoch": 29.0, "grad_norm": 0.026845891028642654, "learning_rate": 0.0001469387755102041, "loss": 0.0046, "step": 29 }, { "epoch": 30.0, "grad_norm": 0.09766196459531784, "learning_rate": 0.0001448979591836735, "loss": 0.0053, "step": 30 }, { "epoch": 31.0, "grad_norm": 0.05061652511358261, "learning_rate": 0.00014285714285714287, "loss": 0.0046, "step": 31 }, { "epoch": 32.0, "grad_norm": 0.0490519218146801, "learning_rate": 
0.00014081632653061224, "loss": 0.0048, "step": 32 }, { "epoch": 33.0, "grad_norm": 0.06446336209774017, "learning_rate": 0.00013877551020408165, "loss": 0.0049, "step": 33 }, { "epoch": 34.0, "grad_norm": 0.007198956795036793, "learning_rate": 0.00013673469387755102, "loss": 0.0045, "step": 34 }, { "epoch": 35.0, "grad_norm": 0.06024309992790222, "learning_rate": 0.0001346938775510204, "loss": 0.0048, "step": 35 }, { "epoch": 36.0, "grad_norm": 0.04099668189883232, "learning_rate": 0.0001326530612244898, "loss": 0.0048, "step": 36 }, { "epoch": 37.0, "grad_norm": 0.030607614666223526, "learning_rate": 0.00013061224489795917, "loss": 0.0047, "step": 37 }, { "epoch": 38.0, "grad_norm": 0.04647354409098625, "learning_rate": 0.00012857142857142858, "loss": 0.0048, "step": 38 }, { "epoch": 39.0, "grad_norm": 0.013468354940414429, "learning_rate": 0.00012653061224489798, "loss": 0.0047, "step": 39 }, { "epoch": 40.0, "grad_norm": 0.02858574502170086, "learning_rate": 0.00012448979591836735, "loss": 0.0045, "step": 40 }, { "epoch": 41.0, "grad_norm": 0.039958544075489044, "learning_rate": 0.00012244897959183676, "loss": 0.0048, "step": 41 }, { "epoch": 42.0, "grad_norm": 0.003950281068682671, "learning_rate": 0.00012040816326530613, "loss": 0.0047, "step": 42 }, { "epoch": 43.0, "grad_norm": 0.029744163155555725, "learning_rate": 0.00011836734693877552, "loss": 0.0047, "step": 43 }, { "epoch": 44.0, "grad_norm": 0.029589517042040825, "learning_rate": 0.0001163265306122449, "loss": 0.0047, "step": 44 }, { "epoch": 45.0, "grad_norm": 0.0025897289160639048, "learning_rate": 0.00011428571428571428, "loss": 0.0049, "step": 45 }, { "epoch": 46.0, "grad_norm": 0.03239087387919426, "learning_rate": 0.00011224489795918367, "loss": 0.005, "step": 46 }, { "epoch": 47.0, "grad_norm": 0.021254172548651695, "learning_rate": 0.00011020408163265306, "loss": 0.0047, "step": 47 }, { "epoch": 48.0, "grad_norm": 0.01296665333211422, "learning_rate": 0.00010816326530612246, "loss": 0.0047, 
"step": 48 }, { "epoch": 49.0, "grad_norm": 0.012932351790368557, "learning_rate": 0.00010612244897959185, "loss": 0.0047, "step": 49 }, { "epoch": 50.0, "grad_norm": 0.0024836526717990637, "learning_rate": 0.00010408163265306123, "loss": 0.0049, "step": 50 }, { "epoch": 51.0, "grad_norm": 0.012770796194672585, "learning_rate": 0.00010204081632653062, "loss": 0.0047, "step": 51 }, { "epoch": 52.0, "grad_norm": 0.010142244398593903, "learning_rate": 0.0001, "loss": 0.0045, "step": 52 }, { "epoch": 53.0, "grad_norm": 0.020766526460647583, "learning_rate": 9.79591836734694e-05, "loss": 0.0047, "step": 53 }, { "epoch": 54.0, "grad_norm": 0.0039475164376199245, "learning_rate": 9.591836734693878e-05, "loss": 0.0047, "step": 54 }, { "epoch": 55.0, "grad_norm": 0.012760567478835583, "learning_rate": 9.387755102040817e-05, "loss": 0.0047, "step": 55 }, { "epoch": 56.0, "grad_norm": 0.028441807255148888, "learning_rate": 9.183673469387756e-05, "loss": 0.0047, "step": 56 }, { "epoch": 57.0, "grad_norm": 0.004045899026095867, "learning_rate": 8.979591836734695e-05, "loss": 0.0047, "step": 57 }, { "epoch": 58.0, "grad_norm": 0.020622774958610535, "learning_rate": 8.775510204081632e-05, "loss": 0.0047, "step": 58 }, { "epoch": 59.0, "grad_norm": 0.010014132596552372, "learning_rate": 8.571428571428571e-05, "loss": 0.0045, "step": 59 }, { "epoch": 60.0, "grad_norm": 0.0038325833156704903, "learning_rate": 8.367346938775511e-05, "loss": 0.0047, "step": 60 }, { "epoch": 61.0, "grad_norm": 0.01254805363714695, "learning_rate": 8.163265306122449e-05, "loss": 0.0047, "step": 61 }, { "epoch": 62.0, "grad_norm": 0.012542281299829483, "learning_rate": 7.959183673469388e-05, "loss": 0.0047, "step": 62 }, { "epoch": 63.0, "grad_norm": 0.006473184563219547, "learning_rate": 7.755102040816327e-05, "loss": 0.0045, "step": 63 }, { "epoch": 64.0, "grad_norm": 0.014362330548465252, "learning_rate": 7.551020408163266e-05, "loss": 0.0049, "step": 64 }, { "epoch": 65.0, "grad_norm": 
0.0204475000500679, "learning_rate": 7.346938775510205e-05, "loss": 0.0047, "step": 65 }, { "epoch": 66.0, "grad_norm": 0.0023198979906737804, "learning_rate": 7.142857142857143e-05, "loss": 0.0049, "step": 66 }, { "epoch": 67.0, "grad_norm": 0.018235381692647934, "learning_rate": 6.938775510204082e-05, "loss": 0.0049, "step": 67 }, { "epoch": 68.0, "grad_norm": 0.012397863902151585, "learning_rate": 6.73469387755102e-05, "loss": 0.0047, "step": 68 }, { "epoch": 69.0, "grad_norm": 0.003842939855530858, "learning_rate": 6.530612244897959e-05, "loss": 0.0047, "step": 69 }, { "epoch": 70.0, "grad_norm": 0.003821918275207281, "learning_rate": 6.326530612244899e-05, "loss": 0.0047, "step": 70 }, { "epoch": 71.0, "grad_norm": 0.0038236272521317005, "learning_rate": 6.122448979591838e-05, "loss": 0.0047, "step": 71 }, { "epoch": 72.0, "grad_norm": 0.009961229749023914, "learning_rate": 5.918367346938776e-05, "loss": 0.0045, "step": 72 }, { "epoch": 73.0, "grad_norm": 0.0037804129533469677, "learning_rate": 5.714285714285714e-05, "loss": 0.0047, "step": 73 }, { "epoch": 74.0, "grad_norm": 0.003801505547016859, "learning_rate": 5.510204081632653e-05, "loss": 0.0047, "step": 74 }, { "epoch": 75.0, "grad_norm": 0.012337015941739082, "learning_rate": 5.3061224489795926e-05, "loss": 0.0047, "step": 75 }, { "epoch": 76.0, "grad_norm": 0.012310854159295559, "learning_rate": 5.102040816326531e-05, "loss": 0.0047, "step": 76 }, { "epoch": 77.0, "grad_norm": 0.0063883536495268345, "learning_rate": 4.89795918367347e-05, "loss": 0.0045, "step": 77 }, { "epoch": 78.0, "grad_norm": 0.0036704791709780693, "learning_rate": 4.6938775510204086e-05, "loss": 0.0047, "step": 78 }, { "epoch": 79.0, "grad_norm": 0.0038380566984415054, "learning_rate": 4.4897959183673474e-05, "loss": 0.0047, "step": 79 }, { "epoch": 80.0, "grad_norm": 0.013979257084429264, "learning_rate": 4.2857142857142856e-05, "loss": 0.0049, "step": 80 }, { "epoch": 81.0, "grad_norm": 0.003754157805815339, "learning_rate": 
4.0816326530612245e-05, "loss": 0.0047, "step": 81 }, { "epoch": 82.0, "grad_norm": 0.003731819801032543, "learning_rate": 3.8775510204081634e-05, "loss": 0.0047, "step": 82 }, { "epoch": 83.0, "grad_norm": 0.003740091575309634, "learning_rate": 3.673469387755102e-05, "loss": 0.0047, "step": 83 }, { "epoch": 84.0, "grad_norm": 0.002331367926672101, "learning_rate": 3.469387755102041e-05, "loss": 0.0049, "step": 84 }, { "epoch": 85.0, "grad_norm": 0.006460592150688171, "learning_rate": 3.265306122448979e-05, "loss": 0.0045, "step": 85 }, { "epoch": 86.0, "grad_norm": 0.01226158905774355, "learning_rate": 3.061224489795919e-05, "loss": 0.0047, "step": 86 }, { "epoch": 87.0, "grad_norm": 0.012233145534992218, "learning_rate": 2.857142857142857e-05, "loss": 0.0047, "step": 87 }, { "epoch": 88.0, "grad_norm": 0.002268604002892971, "learning_rate": 2.6530612244897963e-05, "loss": 0.0049, "step": 88 }, { "epoch": 89.0, "grad_norm": 0.009719762951135635, "learning_rate": 2.448979591836735e-05, "loss": 0.0045, "step": 89 }, { "epoch": 90.0, "grad_norm": 0.003743007080629468, "learning_rate": 2.2448979591836737e-05, "loss": 0.0047, "step": 90 }, { "epoch": 91.0, "grad_norm": 0.00371920526959002, "learning_rate": 2.0408163265306123e-05, "loss": 0.0047, "step": 91 }, { "epoch": 92.0, "grad_norm": 0.0036711846478283405, "learning_rate": 1.836734693877551e-05, "loss": 0.0047, "step": 92 }, { "epoch": 93.0, "grad_norm": 0.020141731947660446, "learning_rate": 1.6326530612244897e-05, "loss": 0.0047, "step": 93 }, { "epoch": 94.0, "grad_norm": 0.0037132452707737684, "learning_rate": 1.4285714285714285e-05, "loss": 0.0047, "step": 94 }, { "epoch": 95.0, "grad_norm": 0.0063994950614869595, "learning_rate": 1.2244897959183674e-05, "loss": 0.0045, "step": 95 }, { "epoch": 96.0, "grad_norm": 0.0037392491940408945, "learning_rate": 1.0204081632653061e-05, "loss": 0.0047, "step": 96 }, { "epoch": 97.0, "grad_norm": 0.003673990024253726, "learning_rate": 8.163265306122448e-06, "loss": 
0.0047, "step": 97 }, { "epoch": 98.0, "grad_norm": 0.012367010116577148, "learning_rate": 6.122448979591837e-06, "loss": 0.0047, "step": 98 }, { "epoch": 99.0, "grad_norm": 0.002525837393477559, "learning_rate": 4.081632653061224e-06, "loss": 0.0049, "step": 99 }, { "epoch": 100.0, "grad_norm": 0.012382498942315578, "learning_rate": 2.040816326530612e-06, "loss": 0.0047, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "total_flos": 713859830784000.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }