{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.8924033522573669, "eval_steps": 500, "global_step": 10500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 0.87135249376297, "learning_rate": 0.00019819808452978876, "loss": 2.1425, "step": 100 }, { "epoch": 0.04, "grad_norm": 0.8564028739929199, "learning_rate": 0.0001963956283227154, "loss": 1.8647, "step": 200 }, { "epoch": 0.05, "grad_norm": 0.8958914875984192, "learning_rate": 0.000194593172115642, "loss": 1.8182, "step": 300 }, { "epoch": 0.07, "grad_norm": 0.8007214069366455, "learning_rate": 0.00019279071590856862, "loss": 1.711, "step": 400 }, { "epoch": 0.09, "grad_norm": 0.8793672919273376, "learning_rate": 0.00019098825970149526, "loss": 1.686, "step": 500 }, { "epoch": 0.11, "grad_norm": 0.9486576318740845, "learning_rate": 0.00018918580349442187, "loss": 1.7068, "step": 600 }, { "epoch": 0.13, "grad_norm": 0.8130625486373901, "learning_rate": 0.0001873833472873485, "loss": 1.6457, "step": 700 }, { "epoch": 0.14, "grad_norm": 0.8903294801712036, "learning_rate": 0.00018558089108027513, "loss": 1.6285, "step": 800 }, { "epoch": 0.16, "grad_norm": 0.9597026109695435, "learning_rate": 0.0001837784348732017, "loss": 1.6305, "step": 900 }, { "epoch": 0.18, "grad_norm": 0.7700974941253662, "learning_rate": 0.00018197597866612835, "loss": 1.6443, "step": 1000 }, { "epoch": 0.2, "grad_norm": 0.8106345534324646, "learning_rate": 0.00018017352245905497, "loss": 1.5963, "step": 1100 }, { "epoch": 0.22, "grad_norm": 1.026309847831726, "learning_rate": 0.00017837106625198158, "loss": 1.5615, "step": 1200 }, { "epoch": 0.23, "grad_norm": 0.8697523474693298, "learning_rate": 0.00017656861004490822, "loss": 1.5295, "step": 1300 }, { "epoch": 0.25, "grad_norm": 1.2120341062545776, "learning_rate": 0.00017476615383783483, "loss": 1.4984, "step": 1400 }, { "epoch": 0.27, "grad_norm": 0.7356016039848328, "learning_rate": 0.00017296369763076145, "loss": 1.5311, "step": 1500 }, { "epoch": 0.29, "grad_norm": 0.8384151458740234, "learning_rate": 0.00017116124142368809, "loss": 1.5632, "step": 1600 }, { "epoch": 0.31, "grad_norm": 0.8941056132316589, "learning_rate": 0.0001693587852166147, "loss": 1.492, "step": 1700 }, { "epoch": 0.32, "grad_norm": 0.7094323039054871, "learning_rate": 0.0001675563290095413, "loss": 1.4425, "step": 1800 }, { "epoch": 0.34, "grad_norm": 0.7246663570404053, "learning_rate": 0.00016575387280246795, "loss": 1.4277, "step": 1900 }, { "epoch": 0.36, "grad_norm": 0.8121210932731628, "learning_rate": 0.00016395141659539456, "loss": 1.464, "step": 2000 }, { "epoch": 0.38, "grad_norm": 0.712011456489563, "learning_rate": 0.0001621489603883212, "loss": 1.4779, "step": 2100 }, { "epoch": 0.4, "grad_norm": 0.7419346570968628, "learning_rate": 0.00016034650418124782, "loss": 1.4669, "step": 2200 }, { "epoch": 0.41, "grad_norm": 1.0694609880447388, "learning_rate": 0.0001585440479741744, "loss": 1.4499, "step": 2300 }, { "epoch": 0.43, "grad_norm": 1.0339300632476807, "learning_rate": 0.00015674159176710104, "loss": 1.4115, "step": 2400 }, { "epoch": 0.45, "grad_norm": 1.1224662065505981, "learning_rate": 0.00015493913556002766, "loss": 1.4353, "step": 2500 }, { "epoch": 0.47, "grad_norm": 0.8455696702003479, "learning_rate": 0.0001531366793529543, "loss": 1.3991, "step": 2600 }, { "epoch": 0.49, "grad_norm": 0.8783261179924011, "learning_rate": 0.0001513342231458809, "loss": 1.4174, "step": 2700 }, { "epoch": 0.5, "grad_norm": 0.8644577264785767, "learning_rate": 0.00014953176693880752, "loss": 1.4272, "step": 2800 }, { "epoch": 0.52, "grad_norm": 0.804175853729248, "learning_rate": 0.00014772931073173416, "loss": 1.3904, "step": 2900 }, { "epoch": 0.54, "grad_norm": 0.8686081767082214, "learning_rate": 0.00014592685452466077, "loss": 1.3577, "step": 3000 }, { "epoch": 0.56, "grad_norm": 0.8131946325302124, "learning_rate": 0.0001441243983175874, "loss": 1.3798, "step": 3100 }, { "epoch": 0.58, "grad_norm": 0.9579694271087646, "learning_rate": 0.00014232194211051403, "loss": 1.3705, "step": 3200 }, { "epoch": 0.59, "grad_norm": 0.7878475785255432, "learning_rate": 0.00014051948590344064, "loss": 1.3216, "step": 3300 }, { "epoch": 0.61, "grad_norm": 0.9384462833404541, "learning_rate": 0.00013871702969636725, "loss": 1.3681, "step": 3400 }, { "epoch": 0.63, "grad_norm": 0.899638295173645, "learning_rate": 0.0001369145734892939, "loss": 1.3752, "step": 3500 }, { "epoch": 0.65, "grad_norm": 0.8509306907653809, "learning_rate": 0.0001351121172822205, "loss": 1.3253, "step": 3600 }, { "epoch": 0.67, "grad_norm": 0.712924063205719, "learning_rate": 0.00013330966107514712, "loss": 1.3318, "step": 3700 }, { "epoch": 0.68, "grad_norm": 0.8807259798049927, "learning_rate": 0.00013150720486807373, "loss": 1.3163, "step": 3800 }, { "epoch": 0.7, "grad_norm": 0.9081091284751892, "learning_rate": 0.00012970474866100034, "loss": 1.3839, "step": 3900 }, { "epoch": 0.72, "grad_norm": 1.0412542819976807, "learning_rate": 0.00012790229245392698, "loss": 1.3057, "step": 4000 }, { "epoch": 0.74, "grad_norm": 0.8416357636451721, "learning_rate": 0.0001260998362468536, "loss": 1.2548, "step": 4100 }, { "epoch": 0.76, "grad_norm": 0.8973735570907593, "learning_rate": 0.0001242973800397802, "loss": 1.3154, "step": 4200 }, { "epoch": 0.77, "grad_norm": 0.7394294738769531, "learning_rate": 0.00012249492383270685, "loss": 1.3079, "step": 4300 }, { "epoch": 0.79, "grad_norm": 1.1180624961853027, "learning_rate": 0.00012069246762563346, "loss": 1.2789, "step": 4400 }, { "epoch": 0.81, "grad_norm": 0.8885756134986877, "learning_rate": 0.00011889001141856009, "loss": 1.2959, "step": 4500 }, { "epoch": 0.83, "grad_norm": 1.1742843389511108, "learning_rate": 0.00011708755521148672, "loss": 1.2486, "step": 4600 }, { "epoch": 0.85, "grad_norm": 0.9566686153411865, "learning_rate": 0.00011528509900441333, "loss": 1.303, "step": 4700 }, { "epoch": 0.87, "grad_norm": 1.2613877058029175, "learning_rate": 0.00011348264279733996, "loss": 1.301, "step": 4800 }, { "epoch": 0.88, "grad_norm": 0.9030331969261169, "learning_rate": 0.00011168018659026658, "loss": 1.3338, "step": 4900 }, { "epoch": 0.9, "grad_norm": 1.0433690547943115, "learning_rate": 0.00010987773038319318, "loss": 1.3068, "step": 5000 }, { "epoch": 0.92, "grad_norm": 0.8587890267372131, "learning_rate": 0.00010807527417611981, "loss": 1.2632, "step": 5100 }, { "epoch": 0.94, "grad_norm": 1.0812350511550903, "learning_rate": 0.00010627281796904642, "loss": 1.278, "step": 5200 }, { "epoch": 0.96, "grad_norm": 0.8623504042625427, "learning_rate": 0.00010447036176197305, "loss": 1.2392, "step": 5300 }, { "epoch": 0.97, "grad_norm": 0.8327571749687195, "learning_rate": 0.00010266790555489967, "loss": 1.2652, "step": 5400 }, { "epoch": 0.99, "grad_norm": 0.958329975605011, "learning_rate": 0.00010086544934782629, "loss": 1.286, "step": 5500 }, { "epoch": 1.01, "grad_norm": 0.9664350748062134, "learning_rate": 9.906299314075291e-05, "loss": 1.1171, "step": 5600 }, { "epoch": 1.03, "grad_norm": 0.8452981114387512, "learning_rate": 9.726053693367954e-05, "loss": 1.0828, "step": 5700 }, { "epoch": 1.05, "grad_norm": 1.0611803531646729, "learning_rate": 9.545808072660615e-05, "loss": 1.1142, "step": 5800 }, { "epoch": 1.06, "grad_norm": 1.0450036525726318, "learning_rate": 9.365562451953278e-05, "loss": 1.1217, "step": 5900 }, { "epoch": 1.08, "grad_norm": 0.9196897745132446, "learning_rate": 9.18531683124594e-05, "loss": 1.1435, "step": 6000 }, { "epoch": 1.1, "grad_norm": 1.241141676902771, "learning_rate": 9.005071210538602e-05, "loss": 1.1174, "step": 6100 }, { "epoch": 1.12, "grad_norm": 0.8073747754096985, "learning_rate": 8.826628046038338e-05, "loss": 1.0501, "step": 6200 }, { "epoch": 1.14, "grad_norm": 0.8413310647010803, "learning_rate": 8.646382425331e-05, "loss": 1.1023, "step": 6300 }, { "epoch": 1.15, "grad_norm": 0.8178868889808655, "learning_rate": 8.466136804623662e-05, "loss": 1.0948, "step": 6400 }, { "epoch": 1.17, "grad_norm": 0.9561821222305298, "learning_rate": 8.285891183916323e-05, "loss": 1.0936, "step": 6500 }, { "epoch": 1.19, "grad_norm": 0.9246460199356079, "learning_rate": 8.105645563208986e-05, "loss": 1.0679, "step": 6600 }, { "epoch": 1.21, "grad_norm": 0.9705007076263428, "learning_rate": 7.925399942501648e-05, "loss": 1.026, "step": 6700 }, { "epoch": 1.23, "grad_norm": 0.9710861444473267, "learning_rate": 7.74515432179431e-05, "loss": 1.0543, "step": 6800 }, { "epoch": 1.24, "grad_norm": 1.0675069093704224, "learning_rate": 7.564908701086972e-05, "loss": 1.0987, "step": 6900 }, { "epoch": 1.26, "grad_norm": 0.8517453670501709, "learning_rate": 7.384663080379635e-05, "loss": 1.0693, "step": 7000 }, { "epoch": 1.28, "grad_norm": 0.901584267616272, "learning_rate": 7.204417459672296e-05, "loss": 1.0685, "step": 7100 }, { "epoch": 1.3, "grad_norm": 1.0663121938705444, "learning_rate": 7.024171838964957e-05, "loss": 1.0802, "step": 7200 }, { "epoch": 1.32, "grad_norm": 1.0489306449890137, "learning_rate": 6.84392621825762e-05, "loss": 1.1116, "step": 7300 }, { "epoch": 1.33, "grad_norm": 0.8096909523010254, "learning_rate": 6.663680597550283e-05, "loss": 1.0675, "step": 7400 }, { "epoch": 1.35, "grad_norm": 1.0951379537582397, "learning_rate": 6.483434976842945e-05, "loss": 1.0631, "step": 7500 }, { "epoch": 1.37, "grad_norm": 1.08359956741333, "learning_rate": 6.303189356135607e-05, "loss": 1.0522, "step": 7600 }, { "epoch": 1.39, "grad_norm": 1.22184419631958, "learning_rate": 6.122943735428269e-05, "loss": 1.0878, "step": 7700 }, { "epoch": 1.41, "grad_norm": 1.087251901626587, "learning_rate": 5.9426981147209305e-05, "loss": 1.0659, "step": 7800 }, { "epoch": 1.42, "grad_norm": 1.020251750946045, "learning_rate": 5.7624524940135925e-05, "loss": 1.0668, "step": 7900 }, { "epoch": 1.44, "grad_norm": 0.9591791033744812, "learning_rate": 5.582206873306255e-05, "loss": 1.0702, "step": 8000 }, { "epoch": 1.46, "grad_norm": 1.0169813632965088, "learning_rate": 5.401961252598917e-05, "loss": 1.0375, "step": 8100 }, { "epoch": 1.48, "grad_norm": 1.044224739074707, "learning_rate": 5.22171563189158e-05, "loss": 1.0777, "step": 8200 }, { "epoch": 1.5, "grad_norm": 1.0525567531585693, "learning_rate": 5.041470011184242e-05, "loss": 0.9967, "step": 8300 }, { "epoch": 1.51, "grad_norm": 0.9581038951873779, "learning_rate": 4.861224390476904e-05, "loss": 1.0595, "step": 8400 }, { "epoch": 1.53, "grad_norm": 1.0464085340499878, "learning_rate": 4.6809787697695656e-05, "loss": 1.0423, "step": 8500 }, { "epoch": 1.55, "grad_norm": 0.982803225517273, "learning_rate": 4.5007331490622276e-05, "loss": 1.0683, "step": 8600 }, { "epoch": 1.57, "grad_norm": 1.1214386224746704, "learning_rate": 4.3204875283548896e-05, "loss": 1.0984, "step": 8700 }, { "epoch": 1.59, "grad_norm": 1.0456256866455078, "learning_rate": 4.1402419076475515e-05, "loss": 1.0549, "step": 8800 }, { "epoch": 1.6, "grad_norm": 1.0025187730789185, "learning_rate": 3.959996286940214e-05, "loss": 1.024, "step": 8900 }, { "epoch": 1.62, "grad_norm": 1.2760844230651855, "learning_rate": 3.7797506662328755e-05, "loss": 1.0313, "step": 9000 }, { "epoch": 1.64, "grad_norm": 0.9632763862609863, "learning_rate": 3.599505045525538e-05, "loss": 1.0263, "step": 9100 }, { "epoch": 1.66, "grad_norm": 1.01961088180542, "learning_rate": 3.4192594248182e-05, "loss": 1.0503, "step": 9200 }, { "epoch": 1.68, "grad_norm": 0.9579876065254211, "learning_rate": 3.239013804110862e-05, "loss": 1.03, "step": 9300 }, { "epoch": 1.69, "grad_norm": 1.400282859802246, "learning_rate": 3.058768183403524e-05, "loss": 1.0481, "step": 9400 }, { "epoch": 1.71, "grad_norm": 1.1665406227111816, "learning_rate": 2.8785225626961863e-05, "loss": 1.0585, "step": 9500 }, { "epoch": 1.73, "grad_norm": 1.1331160068511963, "learning_rate": 2.6982769419888486e-05, "loss": 0.9992, "step": 9600 }, { "epoch": 1.75, "grad_norm": 1.0598838329315186, "learning_rate": 2.5180313212815106e-05, "loss": 1.0663, "step": 9700 }, { "epoch": 1.77, "grad_norm": 1.0826873779296875, "learning_rate": 2.3377857005741726e-05, "loss": 0.9844, "step": 9800 }, { "epoch": 1.78, "grad_norm": 0.9529953002929688, "learning_rate": 2.1575400798668345e-05, "loss": 1.0349, "step": 9900 }, { "epoch": 1.8, "grad_norm": 1.0949389934539795, "learning_rate": 1.977294459159497e-05, "loss": 1.0473, "step": 10000 }, { "epoch": 1.82, "grad_norm": 1.0248372554779053, "learning_rate": 1.7970488384521588e-05, "loss": 1.0382, "step": 10100 }, { "epoch": 1.84, "grad_norm": 0.9931679368019104, "learning_rate": 1.616803217744821e-05, "loss": 1.017, "step": 10200 }, { "epoch": 1.86, "grad_norm": 0.9561355710029602, "learning_rate": 1.436557597037483e-05, "loss": 1.0577, "step": 10300 }, { "epoch": 1.87, "grad_norm": 1.2075040340423584, "learning_rate": 1.2563119763301454e-05, "loss": 1.0493, "step": 10400 }, { "epoch": 1.89, "grad_norm": 1.1561285257339478, "learning_rate": 1.0760663556228073e-05, "loss": 0.9947, "step": 10500 } ], "logging_steps": 100, "max_steps": 11096, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 3.714097560675041e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }