{ "best_metric": null, "best_model_checkpoint": null, "epoch": 100.0, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": NaN, "learning_rate": 0.0, "loss": 7.3765, "step": 1 }, { "epoch": 2.0, "grad_norm": 2.7495672702789307, "learning_rate": 0.0001, "loss": 7.3765, "step": 2 }, { "epoch": 3.0, "grad_norm": 2.7495672702789307, "learning_rate": 0.0002, "loss": 7.3765, "step": 3 }, { "epoch": 4.0, "grad_norm": 3.6275627613067627, "learning_rate": 0.00019795918367346938, "loss": 6.994, "step": 4 }, { "epoch": 5.0, "grad_norm": 7.796189308166504, "learning_rate": 0.0001959183673469388, "loss": 6.341, "step": 5 }, { "epoch": 6.0, "grad_norm": 11.919865608215332, "learning_rate": 0.00019387755102040816, "loss": 5.805, "step": 6 }, { "epoch": 7.0, "grad_norm": Infinity, "learning_rate": 0.00019387755102040816, "loss": 5.2771, "step": 7 }, { "epoch": 8.0, "grad_norm": 15.628558158874512, "learning_rate": 0.00019183673469387756, "loss": 5.2771, "step": 8 }, { "epoch": 9.0, "grad_norm": 18.900388717651367, "learning_rate": 0.00018979591836734697, "loss": 4.7626, "step": 9 }, { "epoch": 10.0, "grad_norm": 21.62285614013672, "learning_rate": 0.00018775510204081634, "loss": 4.2169, "step": 10 }, { "epoch": 11.0, "grad_norm": 23.690582275390625, "learning_rate": 0.00018571428571428572, "loss": 3.623, "step": 11 }, { "epoch": 12.0, "grad_norm": 25.02626609802246, "learning_rate": 0.00018367346938775512, "loss": 2.9824, "step": 12 }, { "epoch": 13.0, "grad_norm": 25.598007202148438, "learning_rate": 0.0001816326530612245, "loss": 2.3122, "step": 13 }, { "epoch": 14.0, "grad_norm": 25.378807067871094, "learning_rate": 0.0001795918367346939, "loss": 1.6226, "step": 14 }, { "epoch": 15.0, "grad_norm": 24.527645111083984, "learning_rate": 0.00017755102040816327, "loss": 0.9334, "step": 15 }, { "epoch": 16.0, "grad_norm": 23.03998565673828, "learning_rate": 0.00017551020408163265, "loss": 0.2465, "step": 16 }, { "epoch": 17.0, "grad_norm": 4.810272216796875, "learning_rate": 0.00017346938775510205, "loss": 0.2217, "step": 17 }, { "epoch": 18.0, "grad_norm": 5.40369987487793, "learning_rate": 0.00017142857142857143, "loss": 0.2093, "step": 18 }, { "epoch": 19.0, "grad_norm": 5.298532962799072, "learning_rate": 0.00016938775510204083, "loss": 0.1797, "step": 19 }, { "epoch": 20.0, "grad_norm": 4.629075050354004, "learning_rate": 0.00016734693877551023, "loss": 0.1339, "step": 20 }, { "epoch": 21.0, "grad_norm": 1.2457849979400635, "learning_rate": 0.0001653061224489796, "loss": 0.092, "step": 21 }, { "epoch": 22.0, "grad_norm": 0.8375206589698792, "learning_rate": 0.00016326530612244898, "loss": 0.0933, "step": 22 }, { "epoch": 23.0, "grad_norm": 0.7440481185913086, "learning_rate": 0.00016122448979591838, "loss": 0.081, "step": 23 }, { "epoch": 24.0, "grad_norm": 0.728550910949707, "learning_rate": 0.00015918367346938776, "loss": 0.0715, "step": 24 }, { "epoch": 25.0, "grad_norm": 0.729324460029602, "learning_rate": 0.00015714285714285716, "loss": 0.0583, "step": 25 }, { "epoch": 26.0, "grad_norm": 0.7445201873779297, "learning_rate": 0.00015510204081632654, "loss": 0.044, "step": 26 }, { "epoch": 27.0, "grad_norm": 0.64507657289505, "learning_rate": 0.0001530612244897959, "loss": 0.0256, "step": 27 }, { "epoch": 28.0, "grad_norm": 0.3869144916534424, "learning_rate": 0.0001510204081632653, "loss": 0.0138, "step": 28 }, { "epoch": 29.0, "grad_norm": 0.17224831879138947, "learning_rate": 0.00014897959183673472, "loss": 0.0087, "step": 29 }, { "epoch": 30.0, "grad_norm": 0.0585104376077652, "learning_rate": 0.0001469387755102041, "loss": 0.0072, "step": 30 }, { "epoch": 31.0, "grad_norm": 0.18696996569633484, "learning_rate": 0.0001448979591836735, "loss": 0.0081, "step": 31 }, { "epoch": 32.0, "grad_norm": 0.10075689852237701, "learning_rate": 0.00014285714285714287, "loss": 0.0072, "step": 32 }, { "epoch": 33.0, "grad_norm": 0.04343040660023689, "learning_rate": 0.00014081632653061224, "loss": 0.0069, "step": 33 }, { "epoch": 34.0, "grad_norm": 0.13335004448890686, "learning_rate": 0.00013877551020408165, "loss": 0.0074, "step": 34 }, { "epoch": 35.0, "grad_norm": 0.0894094929099083, "learning_rate": 0.00013673469387755102, "loss": 0.007, "step": 35 }, { "epoch": 36.0, "grad_norm": 0.01999577507376671, "learning_rate": 0.0001346938775510204, "loss": 0.0067, "step": 36 }, { "epoch": 37.0, "grad_norm": 0.1184980571269989, "learning_rate": 0.0001326530612244898, "loss": 0.0072, "step": 37 }, { "epoch": 38.0, "grad_norm": 0.09607323259115219, "learning_rate": 0.00013061224489795917, "loss": 0.007, "step": 38 }, { "epoch": 39.0, "grad_norm": 0.027331219986081123, "learning_rate": 0.00012857142857142858, "loss": 0.0067, "step": 39 }, { "epoch": 40.0, "grad_norm": 0.08817232400178909, "learning_rate": 0.00012653061224489798, "loss": 0.0069, "step": 40 }, { "epoch": 41.0, "grad_norm": 0.08792853355407715, "learning_rate": 0.00012448979591836735, "loss": 0.0069, "step": 41 }, { "epoch": 42.0, "grad_norm": 0.04289069399237633, "learning_rate": 0.00012244897959183676, "loss": 0.0067, "step": 42 }, { "epoch": 43.0, "grad_norm": 0.04996877163648605, "learning_rate": 0.00012040816326530613, "loss": 0.0067, "step": 43 }, { "epoch": 44.0, "grad_norm": 0.07244863361120224, "learning_rate": 0.00011836734693877552, "loss": 0.0068, "step": 44 }, { "epoch": 45.0, "grad_norm": 0.07215742021799088, "learning_rate": 0.0001163265306122449, "loss": 0.0068, "step": 45 }, { "epoch": 46.0, "grad_norm": 0.01955232582986355, "learning_rate": 0.00011428571428571428, "loss": 0.0067, "step": 46 }, { "epoch": 47.0, "grad_norm": 0.06493868678808212, "learning_rate": 0.00011224489795918367, "loss": 0.0068, "step": 47 }, { "epoch": 48.0, "grad_norm": 0.06490014493465424, "learning_rate": 0.00011020408163265306, "loss": 0.0068, "step": 48 }, { "epoch": 49.0, "grad_norm": 0.019649550318717957, "learning_rate": 0.00010816326530612246, "loss": 0.0067, "step": 49 }, { "epoch": 50.0, "grad_norm": 0.04920223355293274, "learning_rate": 0.00010612244897959185, "loss": 0.0067, "step": 50 }, { "epoch": 51.0, "grad_norm": 0.07163064181804657, "learning_rate": 0.00010408163265306123, "loss": 0.0068, "step": 51 }, { "epoch": 52.0, "grad_norm": 0.005953885614871979, "learning_rate": 0.00010204081632653062, "loss": 0.0066, "step": 52 }, { "epoch": 53.0, "grad_norm": 0.01944654807448387, "learning_rate": 0.0001, "loss": 0.0066, "step": 53 }, { "epoch": 54.0, "grad_norm": 0.0421106182038784, "learning_rate": 9.79591836734694e-05, "loss": 0.0067, "step": 54 }, { "epoch": 55.0, "grad_norm": 0.019489118829369545, "learning_rate": 9.591836734693878e-05, "loss": 0.0066, "step": 55 }, { "epoch": 56.0, "grad_norm": 0.004421094432473183, "learning_rate": 9.387755102040817e-05, "loss": 0.0066, "step": 56 }, { "epoch": 57.0, "grad_norm": 0.026416227221488953, "learning_rate": 9.183673469387756e-05, "loss": 0.0067, "step": 57 }, { "epoch": 58.0, "grad_norm": 0.003954235929995775, "learning_rate": 8.979591836734695e-05, "loss": 0.0066, "step": 58 }, { "epoch": 59.0, "grad_norm": 0.003926219418644905, "learning_rate": 8.775510204081632e-05, "loss": 0.0066, "step": 59 }, { "epoch": 60.0, "grad_norm": 0.0038123615086078644, "learning_rate": 8.571428571428571e-05, "loss": 0.0066, "step": 60 }, { "epoch": 61.0, "grad_norm": 0.003582009579986334, "learning_rate": 8.367346938775511e-05, "loss": 0.0066, "step": 61 }, { "epoch": 62.0, "grad_norm": 0.0035740730818361044, "learning_rate": 8.163265306122449e-05, "loss": 0.0066, "step": 62 }, { "epoch": 63.0, "grad_norm": 0.01964273676276207, "learning_rate": 7.959183673469388e-05, "loss": 0.0066, "step": 63 }, { "epoch": 64.0, "grad_norm": 0.01971287839114666, "learning_rate": 7.755102040816327e-05, "loss": 0.0066, "step": 64 }, { "epoch": 65.0, "grad_norm": 0.0035709121730178595, "learning_rate": 7.551020408163266e-05, "loss": 0.0066, "step": 65 }, { "epoch": 66.0, "grad_norm": 0.003548271721228957, "learning_rate": 7.346938775510205e-05, "loss": 0.0066, "step": 66 }, { "epoch": 67.0, "grad_norm": 0.02695435844361782, "learning_rate": 7.142857142857143e-05, "loss": 0.0066, "step": 67 }, { "epoch": 68.0, "grad_norm": 0.026985742151737213, "learning_rate": 6.938775510204082e-05, "loss": 0.0066, "step": 68 }, { "epoch": 69.0, "grad_norm": 0.00358410133048892, "learning_rate": 6.73469387755102e-05, "loss": 0.0066, "step": 69 }, { "epoch": 70.0, "grad_norm": 0.04342804476618767, "learning_rate": 6.530612244897959e-05, "loss": 0.0067, "step": 70 }, { "epoch": 71.0, "grad_norm": 0.020023003220558167, "learning_rate": 6.326530612244899e-05, "loss": 0.0066, "step": 71 }, { "epoch": 72.0, "grad_norm": 0.020061027258634567, "learning_rate": 6.122448979591838e-05, "loss": 0.0066, "step": 72 }, { "epoch": 73.0, "grad_norm": 0.003791953669860959, "learning_rate": 5.918367346938776e-05, "loss": 0.0066, "step": 73 }, { "epoch": 74.0, "grad_norm": 0.050881966948509216, "learning_rate": 5.714285714285714e-05, "loss": 0.0067, "step": 74 }, { "epoch": 75.0, "grad_norm": 0.027295473963022232, "learning_rate": 5.510204081632653e-05, "loss": 0.0066, "step": 75 }, { "epoch": 76.0, "grad_norm": 0.0037258469965308905, "learning_rate": 5.3061224489795926e-05, "loss": 0.0066, "step": 76 }, { "epoch": 77.0, "grad_norm": 0.020169131457805634, "learning_rate": 5.102040816326531e-05, "loss": 0.0066, "step": 77 }, { "epoch": 78.0, "grad_norm": 0.04392065480351448, "learning_rate": 4.89795918367347e-05, "loss": 0.0067, "step": 78 }, { "epoch": 79.0, "grad_norm": 0.02023773454129696, "learning_rate": 4.6938775510204086e-05, "loss": 0.0066, "step": 79 }, { "epoch": 80.0, "grad_norm": 0.003931655548512936, "learning_rate": 4.4897959183673474e-05, "loss": 0.0066, "step": 80 }, { "epoch": 81.0, "grad_norm": 0.027433717623353004, "learning_rate": 4.2857142857142856e-05, "loss": 0.0066, "step": 81 }, { "epoch": 82.0, "grad_norm": 0.027440495789051056, "learning_rate": 4.0816326530612245e-05, "loss": 0.0066, "step": 82 }, { "epoch": 83.0, "grad_norm": 0.003971911035478115, "learning_rate": 3.8775510204081634e-05, "loss": 0.0066, "step": 83 }, { "epoch": 84.0, "grad_norm": 0.0040692477487027645, "learning_rate": 3.673469387755102e-05, "loss": 0.0066, "step": 84 }, { "epoch": 85.0, "grad_norm": 0.02032075822353363, "learning_rate": 3.469387755102041e-05, "loss": 0.0066, "step": 85 }, { "epoch": 86.0, "grad_norm": 0.004029002971947193, "learning_rate": 3.265306122448979e-05, "loss": 0.0066, "step": 86 }, { "epoch": 87.0, "grad_norm": 0.02034132555127144, "learning_rate": 3.061224489795919e-05, "loss": 0.0066, "step": 87 }, { "epoch": 88.0, "grad_norm": 0.003994234371930361, "learning_rate": 2.857142857142857e-05, "loss": 0.0066, "step": 88 }, { "epoch": 89.0, "grad_norm": 0.004034143406897783, "learning_rate": 2.6530612244897963e-05, "loss": 0.0066, "step": 89 }, { "epoch": 90.0, "grad_norm": 0.004001120571047068, "learning_rate": 2.448979591836735e-05, "loss": 0.0066, "step": 90 }, { "epoch": 91.0, "grad_norm": 0.020308438688516617, "learning_rate": 2.2448979591836737e-05, "loss": 0.0066, "step": 91 }, { "epoch": 92.0, "grad_norm": 0.004174523055553436, "learning_rate": 2.0408163265306123e-05, "loss": 0.0066, "step": 92 }, { "epoch": 93.0, "grad_norm": 0.004282441921532154, "learning_rate": 1.836734693877551e-05, "loss": 0.0066, "step": 93 }, { "epoch": 94.0, "grad_norm": 0.004000538494437933, "learning_rate": 1.6326530612244897e-05, "loss": 0.0066, "step": 94 }, { "epoch": 95.0, "grad_norm": 0.003992615267634392, "learning_rate": 1.4285714285714285e-05, "loss": 0.0066, "step": 95 }, { "epoch": 96.0, "grad_norm": 0.0273627657443285, "learning_rate": 1.2244897959183674e-05, "loss": 0.0066, "step": 96 }, { "epoch": 97.0, "grad_norm": 0.027324387803673744, "learning_rate": 1.0204081632653061e-05, "loss": 0.0066, "step": 97 }, { "epoch": 98.0, "grad_norm": 0.027326995506882668, "learning_rate": 8.163265306122448e-06, "loss": 0.0066, "step": 98 }, { "epoch": 99.0, "grad_norm": 0.020300107076764107, "learning_rate": 6.122448979591837e-06, "loss": 0.0066, "step": 99 }, { "epoch": 100.0, "grad_norm": 0.003976646810770035, "learning_rate": 4.081632653061224e-06, "loss": 0.0066, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "total_flos": 499235306496000.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }