{ "best_metric": 1.3743919134140015, "best_model_checkpoint": "./llama3/21-04-24-Weni-WeniGPT-Agents-Llama3-1.0.8-SFT_Experiment with SFT and Llama3 and updates in requirements-2_max_steps-669_batch_2_2024-04-21_ppid_2917/checkpoint-360", "epoch": 2.8187919463087248, "eval_steps": 30, "global_step": 630, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0447427293064877, "grad_norm": 0.3199349641799927, "learning_rate": 0.0001, "loss": 1.9361, "step": 10 }, { "epoch": 0.0894854586129754, "grad_norm": 0.9833651781082153, "learning_rate": 0.0002, "loss": 1.8397, "step": 20 }, { "epoch": 0.1342281879194631, "grad_norm": 1.0959680080413818, "learning_rate": 0.00019990511519199923, "loss": 1.7877, "step": 30 }, { "epoch": 0.1342281879194631, "eval_loss": 1.7081660032272339, "eval_runtime": 10.9642, "eval_samples_per_second": 4.195, "eval_steps_per_second": 4.195, "step": 30 }, { "epoch": 0.1789709172259508, "grad_norm": 1.2613288164138794, "learning_rate": 0.00019957734949126304, "loss": 1.7523, "step": 40 }, { "epoch": 0.22371364653243847, "grad_norm": 0.7975640892982483, "learning_rate": 0.0001990162991697884, "loss": 1.516, "step": 50 }, { "epoch": 0.2684563758389262, "grad_norm": 0.71113121509552, "learning_rate": 0.0001982232786270059, "loss": 1.3675, "step": 60 }, { "epoch": 0.2684563758389262, "eval_loss": 1.4538882970809937, "eval_runtime": 10.9625, "eval_samples_per_second": 4.196, "eval_steps_per_second": 4.196, "step": 60 }, { "epoch": 0.3131991051454139, "grad_norm": 0.8126013875007629, "learning_rate": 0.00019720014571008158, "loss": 1.4173, "step": 70 }, { "epoch": 0.3579418344519016, "grad_norm": 0.6399104595184326, "learning_rate": 0.00019594929736144976, "loss": 1.5249, "step": 80 }, { "epoch": 0.40268456375838924, "grad_norm": 0.7685297131538391, "learning_rate": 0.00019447366400338116, "loss": 1.3782, "step": 90 }, { "epoch": 0.40268456375838924, "eval_loss": 1.4235661029815674, "eval_runtime": 10.9616, "eval_samples_per_second": 4.196, "eval_steps_per_second": 4.196, "step": 90 }, { "epoch": 0.44742729306487694, "grad_norm": 0.7407508492469788, "learning_rate": 0.00019277670267274258, "loss": 1.3854, "step": 100 }, { "epoch": 0.49217002237136465, "grad_norm": 0.5381172895431519, "learning_rate": 0.0001908623889220311, "loss": 1.3216, "step": 110 }, { "epoch": 0.5369127516778524, "grad_norm": 0.8507822155952454, "learning_rate": 0.00018873520750565718, "loss": 1.3884, "step": 120 }, { "epoch": 0.5369127516778524, "eval_loss": 1.3937652111053467, "eval_runtime": 11.3392, "eval_samples_per_second": 4.057, "eval_steps_per_second": 4.057, "step": 120 }, { "epoch": 0.5816554809843401, "grad_norm": 0.7832561135292053, "learning_rate": 0.00018640014187329578, "loss": 1.3304, "step": 130 }, { "epoch": 0.6263982102908278, "grad_norm": 0.7797898054122925, "learning_rate": 0.00018386266249492057, "loss": 1.4979, "step": 140 }, { "epoch": 0.6711409395973155, "grad_norm": 0.8133324384689331, "learning_rate": 0.00018112871404487202, "loss": 1.3448, "step": 150 }, { "epoch": 0.6711409395973155, "eval_loss": 1.3898664712905884, "eval_runtime": 10.9657, "eval_samples_per_second": 4.195, "eval_steps_per_second": 4.195, "step": 150 }, { "epoch": 0.7158836689038032, "grad_norm": 0.5568630695343018, "learning_rate": 0.00017820470147498455, "loss": 1.3767, "step": 160 }, { "epoch": 0.7606263982102909, "grad_norm": 0.6193764209747314, "learning_rate": 0.00017509747500939928, "loss": 1.3777, "step": 170 }, { "epoch": 0.8053691275167785, "grad_norm": 0.8501898646354675, "learning_rate": 0.00017181431409621644, "loss": 1.3357, "step": 180 }, { "epoch": 0.8053691275167785, "eval_loss": 1.3869752883911133, "eval_runtime": 10.9836, "eval_samples_per_second": 4.188, "eval_steps_per_second": 4.188, "step": 180 }, { "epoch": 0.8501118568232662, "grad_norm": 0.6570385098457336, "learning_rate": 0.00016836291035358375, "loss": 1.3534, "step": 190 }, { "epoch": 0.8948545861297539, "grad_norm": 0.8461380004882812, "learning_rate": 0.0001647513495501749, "loss": 1.3976, "step": 200 }, { "epoch": 0.9395973154362416, "grad_norm": 0.9425082802772522, "learning_rate": 0.000160988092662272, "loss": 1.2788, "step": 210 }, { "epoch": 0.9395973154362416, "eval_loss": 1.3739150762557983, "eval_runtime": 10.967, "eval_samples_per_second": 4.194, "eval_steps_per_second": 4.194, "step": 210 }, { "epoch": 0.9843400447427293, "grad_norm": 1.4384957551956177, "learning_rate": 0.0001570819560518322, "loss": 1.3569, "step": 220 }, { "epoch": 1.029082774049217, "grad_norm": 0.665977418422699, "learning_rate": 0.00015304209081197425, "loss": 1.3895, "step": 230 }, { "epoch": 1.0738255033557047, "grad_norm": 0.736944317817688, "learning_rate": 0.0001488779613282751, "loss": 1.2396, "step": 240 }, { "epoch": 1.0738255033557047, "eval_loss": 1.3777718544006348, "eval_runtime": 10.9845, "eval_samples_per_second": 4.188, "eval_steps_per_second": 4.188, "step": 240 }, { "epoch": 1.1185682326621924, "grad_norm": 0.5746013522148132, "learning_rate": 0.00014459932310610093, "loss": 1.3425, "step": 250 }, { "epoch": 1.1633109619686801, "grad_norm": 0.8226341009140015, "learning_rate": 0.00014021619991591794, "loss": 1.2856, "step": 260 }, { "epoch": 1.2080536912751678, "grad_norm": 1.051847219467163, "learning_rate": 0.00013573886031012584, "loss": 1.2949, "step": 270 }, { "epoch": 1.2080536912751678, "eval_loss": 1.3808656930923462, "eval_runtime": 10.966, "eval_samples_per_second": 4.195, "eval_steps_per_second": 4.195, "step": 270 }, { "epoch": 1.2527964205816555, "grad_norm": 0.6205160021781921, "learning_rate": 0.00013117779356642872, "loss": 1.2785, "step": 280 }, { "epoch": 1.2975391498881432, "grad_norm": 1.1361136436462402, "learning_rate": 0.00012654368511410245, "loss": 1.2825, "step": 290 }, { "epoch": 1.342281879194631, "grad_norm": 0.8586457967758179, "learning_rate": 0.00012184739150072821, "loss": 1.337, "step": 300 }, { "epoch": 1.342281879194631, "eval_loss": 1.3791558742523193, "eval_runtime": 12.241, "eval_samples_per_second": 3.758, "eval_steps_per_second": 3.758, "step": 300 }, { "epoch": 1.3870246085011186, "grad_norm": 1.0672543048858643, "learning_rate": 0.00011709991495803915, "loss": 1.1912, "step": 310 }, { "epoch": 1.4317673378076063, "grad_norm": 0.8796883821487427, "learning_rate": 0.0001123123776264656, "loss": 1.1901, "step": 320 }, { "epoch": 1.476510067114094, "grad_norm": 0.9798341989517212, "learning_rate": 0.00010749599549876472, "loss": 1.3266, "step": 330 }, { "epoch": 1.476510067114094, "eval_loss": 1.3774683475494385, "eval_runtime": 10.9641, "eval_samples_per_second": 4.196, "eval_steps_per_second": 4.196, "step": 330 }, { "epoch": 1.5212527964205815, "grad_norm": 1.070554494857788, "learning_rate": 0.00010266205214377748, "loss": 1.255, "step": 340 }, { "epoch": 1.5659955257270695, "grad_norm": 0.8746442794799805, "learning_rate": 9.782187227187231e-05, "loss": 1.1981, "step": 350 }, { "epoch": 1.610738255033557, "grad_norm": 1.1643601655960083, "learning_rate": 9.298679520400412e-05, "loss": 1.2735, "step": 360 }, { "epoch": 1.610738255033557, "eval_loss": 1.3743919134140015, "eval_runtime": 10.9721, "eval_samples_per_second": 4.192, "eval_steps_per_second": 4.192, "step": 360 }, { "epoch": 1.6554809843400449, "grad_norm": 1.2565709352493286, "learning_rate": 8.816814830654468e-05, "loss": 1.1877, "step": 370 }, { "epoch": 1.7002237136465324, "grad_norm": 1.3815152645111084, "learning_rate": 8.33772204541195e-05, "loss": 1.2719, "step": 380 }, { "epoch": 1.7449664429530203, "grad_norm": 1.1193474531173706, "learning_rate": 7.862523558262116e-05, "loss": 1.2809, "step": 390 }, { "epoch": 1.7449664429530203, "eval_loss": 1.37523353099823, "eval_runtime": 12.3393, "eval_samples_per_second": 3.728, "eval_steps_per_second": 3.728, "step": 390 }, { "epoch": 1.7897091722595078, "grad_norm": 0.9928935766220093, "learning_rate": 7.392332639435752e-05, "loss": 1.3729, "step": 400 }, { "epoch": 1.8344519015659957, "grad_norm": 1.6183288097381592, "learning_rate": 6.928250827693771e-05, "loss": 1.1143, "step": 410 }, { "epoch": 1.8791946308724832, "grad_norm": 1.1342641115188599, "learning_rate": 6.471365349699636e-05, "loss": 1.2383, "step": 420 }, { "epoch": 1.8791946308724832, "eval_loss": 1.377549648284912, "eval_runtime": 10.9609, "eval_samples_per_second": 4.197, "eval_steps_per_second": 4.197, "step": 420 }, { "epoch": 1.9239373601789709, "grad_norm": 0.9746946096420288, "learning_rate": 6.022746572921447e-05, "loss": 1.1807, "step": 430 }, { "epoch": 1.9686800894854586, "grad_norm": 1.2227619886398315, "learning_rate": 5.583445498030848e-05, "loss": 1.1108, "step": 440 }, { "epoch": 2.0134228187919465, "grad_norm": 1.0090824365615845, "learning_rate": 5.1544912966734994e-05, "loss": 1.2116, "step": 450 }, { "epoch": 2.0134228187919465, "eval_loss": 1.385948657989502, "eval_runtime": 10.9657, "eval_samples_per_second": 4.195, "eval_steps_per_second": 4.195, "step": 450 }, { "epoch": 2.058165548098434, "grad_norm": 1.5286532640457153, "learning_rate": 4.7368889003794026e-05, "loss": 1.1827, "step": 460 }, { "epoch": 2.1029082774049215, "grad_norm": 1.072304368019104, "learning_rate": 4.3316166462617355e-05, "loss": 1.2313, "step": 470 }, { "epoch": 2.1476510067114094, "grad_norm": 1.698246717453003, "learning_rate": 3.939623985019679e-05, "loss": 1.0153, "step": 480 }, { "epoch": 2.1476510067114094, "eval_loss": 1.3925849199295044, "eval_runtime": 10.9662, "eval_samples_per_second": 4.195, "eval_steps_per_second": 4.195, "step": 480 }, { "epoch": 2.192393736017897, "grad_norm": 1.3146202564239502, "learning_rate": 3.561829256614856e-05, "loss": 1.2576, "step": 490 }, { "epoch": 2.237136465324385, "grad_norm": 1.545972466468811, "learning_rate": 3.199117538832358e-05, "loss": 1.2075, "step": 500 }, { "epoch": 2.2818791946308723, "grad_norm": 1.2800744771957397, "learning_rate": 2.852338573766675e-05, "loss": 1.2039, "step": 510 }, { "epoch": 2.2818791946308723, "eval_loss": 1.3884193897247314, "eval_runtime": 10.9618, "eval_samples_per_second": 4.196, "eval_steps_per_second": 4.196, "step": 510 }, { "epoch": 2.3266219239373602, "grad_norm": 1.2421742677688599, "learning_rate": 2.5223047770902274e-05, "loss": 1.298, "step": 520 }, { "epoch": 2.3713646532438477, "grad_norm": 0.9423472285270691, "learning_rate": 2.2097893347683198e-05, "loss": 1.1545, "step": 530 }, { "epoch": 2.4161073825503356, "grad_norm": 1.5044140815734863, "learning_rate": 1.915524391679375e-05, "loss": 1.2451, "step": 540 }, { "epoch": 2.4161073825503356, "eval_loss": 1.388563871383667, "eval_runtime": 10.9674, "eval_samples_per_second": 4.194, "eval_steps_per_second": 4.194, "step": 540 }, { "epoch": 2.460850111856823, "grad_norm": 1.0422102212905884, "learning_rate": 1.6401993363841038e-05, "loss": 0.9668, "step": 550 }, { "epoch": 2.505592841163311, "grad_norm": 1.4864928722381592, "learning_rate": 1.3844591860619383e-05, "loss": 1.1114, "step": 560 }, { "epoch": 2.5503355704697985, "grad_norm": 1.3897795677185059, "learning_rate": 1.148903075398431e-05, "loss": 1.2311, "step": 570 }, { "epoch": 2.5503355704697985, "eval_loss": 1.3921420574188232, "eval_runtime": 11.038, "eval_samples_per_second": 4.167, "eval_steps_per_second": 4.167, "step": 570 }, { "epoch": 2.5950782997762865, "grad_norm": 1.2699692249298096, "learning_rate": 9.340828529637602e-06, "loss": 1.2615, "step": 580 }, { "epoch": 2.639821029082774, "grad_norm": 1.8885409832000732, "learning_rate": 7.405017883706766e-06, "loss": 1.0575, "step": 590 }, { "epoch": 2.684563758389262, "grad_norm": 1.5944920778274536, "learning_rate": 5.686133932407156e-06, "loss": 1.1299, "step": 600 }, { "epoch": 2.684563758389262, "eval_loss": 1.3941093683242798, "eval_runtime": 10.9684, "eval_samples_per_second": 4.194, "eval_steps_per_second": 4.194, "step": 600 }, { "epoch": 2.7293064876957494, "grad_norm": 1.389388918876648, "learning_rate": 4.188203587408146e-06, "loss": 1.1813, "step": 610 }, { "epoch": 2.7740492170022373, "grad_norm": 1.20370614528656, "learning_rate": 2.914736121794559e-06, "loss": 1.1991, "step": 620 }, { "epoch": 2.8187919463087248, "grad_norm": 1.2309448719024658, "learning_rate": 1.868714948724626e-06, "loss": 1.2163, "step": 630 }, { "epoch": 2.8187919463087248, "eval_loss": 1.3913480043411255, "eval_runtime": 10.9641, "eval_samples_per_second": 4.196, "eval_steps_per_second": 4.196, "step": 630 } ], "logging_steps": 10, "max_steps": 669, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 90, "total_flos": 1.2861217539072e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }