{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 1110, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009009009009009009, "grad_norm": 608.0, "learning_rate": 1.801801801801802e-06, "loss": 58.5641, "step": 1 }, { "epoch": 0.04504504504504504, "grad_norm": 532.0, "learning_rate": 9.00900900900901e-06, "loss": 54.6181, "step": 5 }, { "epoch": 0.09009009009009009, "grad_norm": 446.0, "learning_rate": 1.801801801801802e-05, "loss": 50.0236, "step": 10 }, { "epoch": 0.13513513513513514, "grad_norm": 193.0, "learning_rate": 2.702702702702703e-05, "loss": 33.1549, "step": 15 }, { "epoch": 0.18018018018018017, "grad_norm": 44.5, "learning_rate": 3.603603603603604e-05, "loss": 25.2428, "step": 20 }, { "epoch": 0.22522522522522523, "grad_norm": 26.625, "learning_rate": 4.5045045045045046e-05, "loss": 22.4735, "step": 25 }, { "epoch": 0.2702702702702703, "grad_norm": 17.25, "learning_rate": 5.405405405405406e-05, "loss": 20.4661, "step": 30 }, { "epoch": 0.3153153153153153, "grad_norm": 7.6875, "learning_rate": 6.306306306306306e-05, "loss": 19.1401, "step": 35 }, { "epoch": 0.36036036036036034, "grad_norm": 11.6875, "learning_rate": 7.207207207207208e-05, "loss": 18.3188, "step": 40 }, { "epoch": 0.40540540540540543, "grad_norm": 23.5, "learning_rate": 8.108108108108109e-05, "loss": 16.7622, "step": 45 }, { "epoch": 0.45045045045045046, "grad_norm": 56.25, "learning_rate": 9.009009009009009e-05, "loss": 12.6183, "step": 50 }, { "epoch": 0.4954954954954955, "grad_norm": 13.0, "learning_rate": 9.90990990990991e-05, "loss": 4.3593, "step": 55 }, { "epoch": 0.5405405405405406, "grad_norm": 3.875, "learning_rate": 0.00010810810810810812, "loss": 2.18, "step": 60 }, { "epoch": 0.5855855855855856, "grad_norm": 2.421875, "learning_rate": 0.00011711711711711712, "loss": 1.8179, "step": 65 }, { "epoch": 0.6306306306306306, "grad_norm": 3.265625, "learning_rate": 0.00012612612612612612, "loss": 1.5974, "step": 70 }, { "epoch": 0.6756756756756757, "grad_norm": 2.375, "learning_rate": 0.00013513513513513514, "loss": 1.486, "step": 75 }, { "epoch": 0.7207207207207207, "grad_norm": 1.5078125, "learning_rate": 0.00014414414414414415, "loss": 1.361, "step": 80 }, { "epoch": 0.7657657657657657, "grad_norm": 3.890625, "learning_rate": 0.00015315315315315314, "loss": 1.3001, "step": 85 }, { "epoch": 0.8108108108108109, "grad_norm": 4.46875, "learning_rate": 0.00016216216216216218, "loss": 1.261, "step": 90 }, { "epoch": 0.8558558558558559, "grad_norm": 5.03125, "learning_rate": 0.0001711711711711712, "loss": 1.2015, "step": 95 }, { "epoch": 0.9009009009009009, "grad_norm": 32.25, "learning_rate": 0.00018018018018018018, "loss": 1.1886, "step": 100 }, { "epoch": 0.9459459459459459, "grad_norm": 1.703125, "learning_rate": 0.0001891891891891892, "loss": 1.1679, "step": 105 }, { "epoch": 0.990990990990991, "grad_norm": 2.984375, "learning_rate": 0.0001981981981981982, "loss": 1.1572, "step": 110 }, { "epoch": 1.0, "eval_loss": 2.307225465774536, "eval_runtime": 1.0056, "eval_samples_per_second": 4.972, "eval_steps_per_second": 1.989, "step": 111 }, { "epoch": 1.0360360360360361, "grad_norm": 1.546875, "learning_rate": 0.00019999208860571255, "loss": 1.0473, "step": 115 }, { "epoch": 1.0810810810810811, "grad_norm": 1.546875, "learning_rate": 0.0001999599507118322, "loss": 1.0618, "step": 120 }, { "epoch": 1.1261261261261262, "grad_norm": 10.0, "learning_rate": 0.00019990309979553045, "loss": 1.0458, "step": 125 }, { "epoch": 1.1711711711711712, "grad_norm": 8.4375, "learning_rate": 0.00019982154991201608, "loss": 1.0364, "step": 130 }, { "epoch": 1.2162162162162162, "grad_norm": 2.0, "learning_rate": 0.00019971532122280464, "loss": 1.0457, "step": 135 }, { "epoch": 1.2612612612612613, "grad_norm": 1.4453125, "learning_rate": 0.00019958443999073397, "loss": 0.9906, "step": 140 }, { "epoch": 1.3063063063063063, "grad_norm": 18.25, "learning_rate": 0.00019942893857347128, "loss": 0.9911, "step": 145 }, { "epoch": 1.3513513513513513, "grad_norm": 2.578125, "learning_rate": 0.0001992488554155135, "loss": 0.9996, "step": 150 }, { "epoch": 1.3963963963963963, "grad_norm": 1.7734375, "learning_rate": 0.00019904423503868247, "loss": 0.9656, "step": 155 }, { "epoch": 1.4414414414414414, "grad_norm": 5.65625, "learning_rate": 0.00019881512803111796, "loss": 0.9753, "step": 160 }, { "epoch": 1.4864864864864864, "grad_norm": 3.78125, "learning_rate": 0.00019856159103477086, "loss": 0.9239, "step": 165 }, { "epoch": 1.5315315315315314, "grad_norm": 0.86328125, "learning_rate": 0.00019828368673139947, "loss": 0.9428, "step": 170 }, { "epoch": 1.5765765765765765, "grad_norm": 0.7265625, "learning_rate": 0.00019798148382707296, "loss": 0.9455, "step": 175 }, { "epoch": 1.6216216216216215, "grad_norm": 1.8125, "learning_rate": 0.00019765505703518496, "loss": 0.9373, "step": 180 }, { "epoch": 1.6666666666666665, "grad_norm": 1.0078125, "learning_rate": 0.00019730448705798239, "loss": 0.9659, "step": 185 }, { "epoch": 1.7117117117117115, "grad_norm": 3.859375, "learning_rate": 0.00019692986056661356, "loss": 0.9271, "step": 190 }, { "epoch": 1.7567567567567568, "grad_norm": 3.34375, "learning_rate": 0.00019653127017970034, "loss": 0.9303, "step": 195 }, { "epoch": 1.8018018018018018, "grad_norm": 0.91796875, "learning_rate": 0.0001961088144404403, "loss": 0.9333, "step": 200 }, { "epoch": 1.8468468468468469, "grad_norm": 1.4453125, "learning_rate": 0.00019566259779224378, "loss": 0.8923, "step": 205 }, { "epoch": 1.8918918918918919, "grad_norm": 3.171875, "learning_rate": 0.00019519273055291266, "loss": 0.9, "step": 210 }, { "epoch": 1.936936936936937, "grad_norm": 14.3125, "learning_rate": 0.00019469932888736632, "loss": 0.8988, "step": 215 }, { "epoch": 1.981981981981982, "grad_norm": 3.46875, "learning_rate": 0.0001941825147789225, "loss": 0.9296, "step": 220 }, { "epoch": 2.0, "eval_loss": 2.178852081298828, "eval_runtime": 1.0053, "eval_samples_per_second": 4.973, "eval_steps_per_second": 1.989, "step": 222 }, { "epoch": 2.027027027027027, "grad_norm": 1.4921875, "learning_rate": 0.00019364241599913924, "loss": 0.8696, "step": 225 }, { "epoch": 2.0720720720720722, "grad_norm": 2.65625, "learning_rate": 0.0001930791660762262, "loss": 0.8363, "step": 230 }, { "epoch": 2.1171171171171173, "grad_norm": 1.265625, "learning_rate": 0.00019249290426203252, "loss": 0.821, "step": 235 }, { "epoch": 2.1621621621621623, "grad_norm": 2.546875, "learning_rate": 0.00019188377549761963, "loss": 0.8511, "step": 240 }, { "epoch": 2.2072072072072073, "grad_norm": 0.828125, "learning_rate": 0.0001912519303774276, "loss": 0.8231, "step": 245 }, { "epoch": 2.2522522522522523, "grad_norm": 0.73046875, "learning_rate": 0.000190597525112044, "loss": 0.8496, "step": 250 }, { "epoch": 2.2972972972972974, "grad_norm": 1.2421875, "learning_rate": 0.00018992072148958368, "loss": 0.852, "step": 255 }, { "epoch": 2.3423423423423424, "grad_norm": 1.578125, "learning_rate": 0.0001892216868356904, "loss": 0.8131, "step": 260 }, { "epoch": 2.3873873873873874, "grad_norm": 1.5078125, "learning_rate": 0.00018850059397216876, "loss": 0.8483, "step": 265 }, { "epoch": 2.4324324324324325, "grad_norm": 1.125, "learning_rate": 0.00018775762117425777, "loss": 0.8432, "step": 270 }, { "epoch": 2.4774774774774775, "grad_norm": 0.6015625, "learning_rate": 0.00018699295212655596, "loss": 0.8493, "step": 275 }, { "epoch": 2.5225225225225225, "grad_norm": 0.8828125, "learning_rate": 0.00018620677587760916, "loss": 0.7998, "step": 280 }, { "epoch": 2.5675675675675675, "grad_norm": 0.73046875, "learning_rate": 0.0001853992867931721, "loss": 0.8256, "step": 285 }, { "epoch": 2.6126126126126126, "grad_norm": 0.6796875, "learning_rate": 0.00018457068450815562, "loss": 0.8162, "step": 290 }, { "epoch": 2.6576576576576576, "grad_norm": 0.671875, "learning_rate": 0.0001837211738772711, "loss": 0.8338, "step": 295 }, { "epoch": 2.7027027027027026, "grad_norm": 0.9140625, "learning_rate": 0.00018285096492438424, "loss": 0.8279, "step": 300 }, { "epoch": 2.7477477477477477, "grad_norm": 0.60546875, "learning_rate": 0.00018196027279059117, "loss": 0.7962, "step": 305 }, { "epoch": 2.7927927927927927, "grad_norm": 2.78125, "learning_rate": 0.0001810493176810292, "loss": 0.8192, "step": 310 }, { "epoch": 2.8378378378378377, "grad_norm": 0.63671875, "learning_rate": 0.00018011832481043576, "loss": 0.8147, "step": 315 }, { "epoch": 2.8828828828828827, "grad_norm": 0.56640625, "learning_rate": 0.00017916752434746856, "loss": 0.8255, "step": 320 }, { "epoch": 2.9279279279279278, "grad_norm": 1.8046875, "learning_rate": 0.0001781971513578013, "loss": 0.8059, "step": 325 }, { "epoch": 2.972972972972973, "grad_norm": 1.1640625, "learning_rate": 0.00017720744574600863, "loss": 0.8273, "step": 330 }, { "epoch": 3.0, "eval_loss": 2.1709225177764893, "eval_runtime": 1.0054, "eval_samples_per_second": 4.973, "eval_steps_per_second": 1.989, "step": 333 }, { "epoch": 3.018018018018018, "grad_norm": 0.83984375, "learning_rate": 0.00017619865219625452, "loss": 0.7934, "step": 335 }, { "epoch": 3.063063063063063, "grad_norm": 1.5859375, "learning_rate": 0.00017517102011179933, "loss": 0.7096, "step": 340 }, { "epoch": 3.108108108108108, "grad_norm": 0.91796875, "learning_rate": 0.00017412480355334005, "loss": 0.7203, "step": 345 }, { "epoch": 3.153153153153153, "grad_norm": 1.9296875, "learning_rate": 0.00017306026117619889, "loss": 0.7237, "step": 350 }, { "epoch": 3.1981981981981984, "grad_norm": 1.6328125, "learning_rate": 0.00017197765616637636, "loss": 0.738, "step": 355 }, { "epoch": 3.2432432432432434, "grad_norm": 2.0625, "learning_rate": 0.00017087725617548385, "loss": 0.7214, "step": 360 }, { "epoch": 3.2882882882882885, "grad_norm": 3.53125, "learning_rate": 0.0001697593332545723, "loss": 0.7549, "step": 365 }, { "epoch": 3.3333333333333335, "grad_norm": 4.25, "learning_rate": 0.0001686241637868734, "loss": 0.7575, "step": 370 }, { "epoch": 3.3783783783783785, "grad_norm": 1.6484375, "learning_rate": 0.00016747202841946928, "loss": 0.7392, "step": 375 }, { "epoch": 3.4234234234234235, "grad_norm": 1.6171875, "learning_rate": 0.00016630321199390867, "loss": 0.7251, "step": 380 }, { "epoch": 3.4684684684684686, "grad_norm": 1.5546875, "learning_rate": 0.0001651180034757856, "loss": 0.7285, "step": 385 }, { "epoch": 3.5135135135135136, "grad_norm": 1.0078125, "learning_rate": 0.0001639166958832985, "loss": 0.7114, "step": 390 }, { "epoch": 3.5585585585585586, "grad_norm": 1.2421875, "learning_rate": 0.00016269958621480788, "loss": 0.7223, "step": 395 }, { "epoch": 3.6036036036036037, "grad_norm": 0.61328125, "learning_rate": 0.00016146697537540924, "loss": 0.7273, "step": 400 }, { "epoch": 3.6486486486486487, "grad_norm": 0.67578125, "learning_rate": 0.00016021916810254097, "loss": 0.7328, "step": 405 }, { "epoch": 3.6936936936936937, "grad_norm": 1.1015625, "learning_rate": 0.00015895647289064396, "loss": 0.7409, "step": 410 }, { "epoch": 3.7387387387387387, "grad_norm": 0.7578125, "learning_rate": 0.000157679201914893, "loss": 0.7247, "step": 415 }, { "epoch": 3.7837837837837838, "grad_norm": 1.890625, "learning_rate": 0.0001563876709540178, "loss": 0.7446, "step": 420 }, { "epoch": 3.828828828828829, "grad_norm": 0.7109375, "learning_rate": 0.0001550821993122334, "loss": 0.7421, "step": 425 }, { "epoch": 3.873873873873874, "grad_norm": 0.73046875, "learning_rate": 0.00015376310974029873, "loss": 0.7362, "step": 430 }, { "epoch": 3.918918918918919, "grad_norm": 0.66015625, "learning_rate": 0.00015243072835572318, "loss": 0.7398, "step": 435 }, { "epoch": 3.963963963963964, "grad_norm": 0.69921875, "learning_rate": 0.0001510853845621409, "loss": 0.7586, "step": 440 }, { "epoch": 4.0, "eval_loss": 2.2163968086242676, "eval_runtime": 1.0061, "eval_samples_per_second": 4.97, "eval_steps_per_second": 1.988, "step": 444 }, { "epoch": 4.009009009009009, "grad_norm": 0.58203125, "learning_rate": 0.00014972741096787242, "loss": 0.7128, "step": 445 }, { "epoch": 4.054054054054054, "grad_norm": 0.75, "learning_rate": 0.00014835714330369446, "loss": 0.6463, "step": 450 }, { "epoch": 4.099099099099099, "grad_norm": 0.83203125, "learning_rate": 0.00014697492033983707, "loss": 0.6453, "step": 455 }, { "epoch": 4.1441441441441444, "grad_norm": 0.55859375, "learning_rate": 0.00014558108380223012, "loss": 0.647, "step": 460 }, { "epoch": 4.1891891891891895, "grad_norm": 1.3125, "learning_rate": 0.00014417597828801832, "loss": 0.626, "step": 465 }, { "epoch": 4.2342342342342345, "grad_norm": 0.85546875, "learning_rate": 0.00014275995118036693, "loss": 0.6334, "step": 470 }, { "epoch": 4.2792792792792795, "grad_norm": 0.69921875, "learning_rate": 0.0001413333525625784, "loss": 0.6435, "step": 475 }, { "epoch": 4.324324324324325, "grad_norm": 0.8046875, "learning_rate": 0.00013989653513154165, "loss": 0.6439, "step": 480 }, { "epoch": 4.36936936936937, "grad_norm": 1.0859375, "learning_rate": 0.00013844985411053492, "loss": 0.6559, "step": 485 }, { "epoch": 4.414414414414415, "grad_norm": 1.3359375, "learning_rate": 0.00013699366716140435, "loss": 0.6654, "step": 490 }, { "epoch": 4.45945945945946, "grad_norm": 0.80859375, "learning_rate": 0.00013552833429613938, "loss": 0.6783, "step": 495 }, { "epoch": 4.504504504504505, "grad_norm": 0.6875, "learning_rate": 0.00013405421778786737, "loss": 0.6543, "step": 500 }, { "epoch": 4.54954954954955, "grad_norm": 0.62890625, "learning_rate": 0.00013257168208128908, "loss": 0.6608, "step": 505 }, { "epoch": 4.594594594594595, "grad_norm": 0.60546875, "learning_rate": 0.00013108109370257712, "loss": 0.6621, "step": 510 }, { "epoch": 4.63963963963964, "grad_norm": 0.67578125, "learning_rate": 0.00012958282116876026, "loss": 0.656, "step": 515 }, { "epoch": 4.684684684684685, "grad_norm": 0.65234375, "learning_rate": 0.00012807723489661495, "loss": 0.6505, "step": 520 }, { "epoch": 4.72972972972973, "grad_norm": 0.921875, "learning_rate": 0.00012656470711108764, "loss": 0.6789, "step": 525 }, { "epoch": 4.774774774774775, "grad_norm": 0.61328125, "learning_rate": 0.00012504561175326985, "loss": 0.6588, "step": 530 }, { "epoch": 4.81981981981982, "grad_norm": 0.703125, "learning_rate": 0.00012352032438794902, "loss": 0.6534, "step": 535 }, { "epoch": 4.864864864864865, "grad_norm": 0.74609375, "learning_rate": 0.00012198922211075778, "loss": 0.6482, "step": 540 }, { "epoch": 4.90990990990991, "grad_norm": 0.94140625, "learning_rate": 0.00012045268345494511, "loss": 0.6595, "step": 545 }, { "epoch": 4.954954954954955, "grad_norm": 0.59765625, "learning_rate": 0.00011891108829779165, "loss": 0.6624, "step": 550 }, { "epoch": 5.0, "grad_norm": 0.578125, "learning_rate": 0.00011736481776669306, "loss": 0.6613, "step": 555 }, { "epoch": 5.0, "eval_loss": 2.3182225227355957, "eval_runtime": 1.0028, "eval_samples_per_second": 4.986, "eval_steps_per_second": 1.994, "step": 555 }, { "epoch": 5.045045045045045, "grad_norm": 0.98046875, "learning_rate": 0.0001158142541449341, "loss": 0.5564, "step": 560 }, { "epoch": 5.09009009009009, "grad_norm": 0.69140625, "learning_rate": 0.00011425978077717709, "loss": 0.5273, "step": 565 }, { "epoch": 5.135135135135135, "grad_norm": 0.69921875, "learning_rate": 0.00011270178197468789, "loss": 0.5589, "step": 570 }, { "epoch": 5.18018018018018, "grad_norm": 0.6171875, "learning_rate": 0.00011114064292032282, "loss": 0.5593, "step": 575 }, { "epoch": 5.225225225225225, "grad_norm": 0.69921875, "learning_rate": 0.00010957674957330042, "loss": 0.5672, "step": 580 }, { "epoch": 5.27027027027027, "grad_norm": 0.69140625, "learning_rate": 0.00010801048857378071, "loss": 0.5444, "step": 585 }, { "epoch": 5.315315315315315, "grad_norm": 0.66796875, "learning_rate": 0.00010644224714727681, "loss": 0.5747, "step": 590 }, { "epoch": 5.36036036036036, "grad_norm": 0.68359375, "learning_rate": 0.0001048724130089212, "loss": 0.5609, "step": 595 }, { "epoch": 5.405405405405405, "grad_norm": 0.8984375, "learning_rate": 0.00010330137426761135, "loss": 0.5625, "step": 600 }, { "epoch": 5.45045045045045, "grad_norm": 0.76171875, "learning_rate": 0.00010172951933005775, "loss": 0.5671, "step": 605 }, { "epoch": 5.495495495495495, "grad_norm": 0.80859375, "learning_rate": 0.00010015723680475846, "loss": 0.564, "step": 610 }, { "epoch": 5.54054054054054, "grad_norm": 0.76171875, "learning_rate": 9.858491540592382e-05, "loss": 0.5784, "step": 615 }, { "epoch": 5.585585585585585, "grad_norm": 0.7265625, "learning_rate": 9.70129438573747e-05, "loss": 0.5672, "step": 620 }, { "epoch": 5.63063063063063, "grad_norm": 0.75390625, "learning_rate": 9.54417107964389e-05, "loss": 0.5592, "step": 625 }, { "epoch": 5.675675675675675, "grad_norm": 0.734375, "learning_rate": 9.38716046778684e-05, "loss": 0.5634, "step": 630 }, { "epoch": 5.7207207207207205, "grad_norm": 0.6640625, "learning_rate": 9.230301367780208e-05, "loss": 0.5691, "step": 635 }, { "epoch": 5.7657657657657655, "grad_norm": 0.6875, "learning_rate": 9.07363255977973e-05, "loss": 0.5722, "step": 640 }, { "epoch": 5.8108108108108105, "grad_norm": 0.76953125, "learning_rate": 8.917192776895382e-05, "loss": 0.5827, "step": 645 }, { "epoch": 5.8558558558558556, "grad_norm": 0.83203125, "learning_rate": 8.76102069561545e-05, "loss": 0.5745, "step": 650 }, { "epoch": 5.900900900900901, "grad_norm": 0.7265625, "learning_rate": 8.605154926244543e-05, "loss": 0.5614, "step": 655 }, { "epoch": 5.945945945945946, "grad_norm": 0.65625, "learning_rate": 8.449634003358022e-05, "loss": 0.5731, "step": 660 }, { "epoch": 5.990990990990991, "grad_norm": 0.8828125, "learning_rate": 8.294496376275104e-05, "loss": 0.577, "step": 665 }, { "epoch": 6.0, "eval_loss": 2.4773526191711426, "eval_runtime": 1.0034, "eval_samples_per_second": 4.983, "eval_steps_per_second": 1.993, "step": 666 }, { "epoch": 6.036036036036036, "grad_norm": 0.8984375, "learning_rate": 8.13978039955308e-05, "loss": 0.5142, "step": 670 }, { "epoch": 6.081081081081081, "grad_norm": 0.8359375, "learning_rate": 7.985524323504948e-05, "loss": 0.4725, "step": 675 }, { "epoch": 6.126126126126126, "grad_norm": 0.7734375, "learning_rate": 7.831766284742807e-05, "loss": 0.4671, "step": 680 }, { "epoch": 6.171171171171171, "grad_norm": 0.7578125, "learning_rate": 7.678544296749384e-05, "loss": 0.4804, "step": 685 }, { "epoch": 6.216216216216216, "grad_norm": 0.82421875, "learning_rate": 7.525896240479976e-05, "loss": 0.4704, "step": 690 }, { "epoch": 6.261261261261261, "grad_norm": 0.75, "learning_rate": 7.37385985499718e-05, "loss": 0.4659, "step": 695 }, { "epoch": 6.306306306306306, "grad_norm": 0.71484375, "learning_rate": 7.222472728140695e-05, "loss": 0.4697, "step": 700 }, { "epoch": 6.351351351351352, "grad_norm": 0.79296875, "learning_rate": 7.071772287234497e-05, "loss": 0.4912, "step": 705 }, { "epoch": 6.396396396396397, "grad_norm": 0.76953125, "learning_rate": 6.921795789833723e-05, "loss": 0.4689, "step": 710 }, { "epoch": 6.441441441441442, "grad_norm": 0.66796875, "learning_rate": 6.772580314513508e-05, "loss": 0.4753, "step": 715 }, { "epoch": 6.486486486486487, "grad_norm": 0.75, "learning_rate": 6.624162751702076e-05, "loss": 0.4759, "step": 720 }, { "epoch": 6.531531531531532, "grad_norm": 0.70703125, "learning_rate": 6.476579794560356e-05, "loss": 0.489, "step": 725 }, { "epoch": 6.576576576576577, "grad_norm": 0.7265625, "learning_rate": 6.329867929910347e-05, "loss": 0.473, "step": 730 }, { "epoch": 6.621621621621622, "grad_norm": 0.7109375, "learning_rate": 6.184063429214515e-05, "loss": 0.4793, "step": 735 }, { "epoch": 6.666666666666667, "grad_norm": 0.76171875, "learning_rate": 6.039202339608432e-05, "loss": 0.5071, "step": 740 }, { "epoch": 6.711711711711712, "grad_norm": 0.69921875, "learning_rate": 5.895320474988864e-05, "loss": 0.4741, "step": 745 }, { "epoch": 6.756756756756757, "grad_norm": 0.69921875, "learning_rate": 5.752453407159522e-05, "loss": 0.4799, "step": 750 }, { "epoch": 6.801801801801802, "grad_norm": 0.7578125, "learning_rate": 5.610636457036693e-05, "loss": 0.4901, "step": 755 }, { "epoch": 6.846846846846847, "grad_norm": 0.6953125, "learning_rate": 5.469904685916861e-05, "loss": 0.4858, "step": 760 }, { "epoch": 6.891891891891892, "grad_norm": 0.76953125, "learning_rate": 5.33029288680852e-05, "loss": 0.4895, "step": 765 }, { "epoch": 6.936936936936937, "grad_norm": 0.70703125, "learning_rate": 5.191835575830352e-05, "loss": 0.4935, "step": 770 }, { "epoch": 6.981981981981982, "grad_norm": 0.69921875, "learning_rate": 5.0545669836778144e-05, "loss": 0.4958, "step": 775 }, { "epoch": 7.0, "eval_loss": 2.7035882472991943, "eval_runtime": 1.0058, "eval_samples_per_second": 4.971, "eval_steps_per_second": 1.988, "step": 777 }, { "epoch": 7.027027027027027, "grad_norm": 0.6875, "learning_rate": 4.918521047160308e-05, "loss": 0.4443, "step": 780 }, { "epoch": 7.072072072072072, "grad_norm": 0.7734375, "learning_rate": 4.783731400811022e-05, "loss": 0.4139, "step": 785 }, { "epoch": 7.117117117117117, "grad_norm": 0.734375, "learning_rate": 4.650231368571486e-05, "loss": 0.41, "step": 790 }, { "epoch": 7.162162162162162, "grad_norm": 0.90625, "learning_rate": 4.518053955552903e-05, "loss": 0.4291, "step": 795 }, { "epoch": 7.207207207207207, "grad_norm": 0.71875, "learning_rate": 4.387231839876349e-05, "loss": 0.4141, "step": 800 }, { "epoch": 7.252252252252252, "grad_norm": 0.7265625, "learning_rate": 4.2577973645937674e-05, "loss": 0.4139, "step": 805 }, { "epoch": 7.297297297297297, "grad_norm": 0.76171875, "learning_rate": 4.129782529691815e-05, "loss": 0.4278, "step": 810 }, { "epoch": 7.342342342342342, "grad_norm": 0.73046875, "learning_rate": 4.003218984180552e-05, "loss": 0.4148, "step": 815 }, { "epoch": 7.387387387387387, "grad_norm": 0.79296875, "learning_rate": 3.878138018268866e-05, "loss": 0.4168, "step": 820 }, { "epoch": 7.4324324324324325, "grad_norm": 0.82421875, "learning_rate": 3.7545705556286126e-05, "loss": 0.4182, "step": 825 }, { "epoch": 7.4774774774774775, "grad_norm": 0.70703125, "learning_rate": 3.632547145749395e-05, "loss": 0.4239, "step": 830 }, { "epoch": 7.5225225225225225, "grad_norm": 0.78515625, "learning_rate": 3.5120979563858266e-05, "loss": 0.4137, "step": 835 }, { "epoch": 7.5675675675675675, "grad_norm": 0.73828125, "learning_rate": 3.393252766099187e-05, "loss": 0.4111, "step": 840 }, { "epoch": 7.612612612612613, "grad_norm": 0.7421875, "learning_rate": 3.2760409568952766e-05, "loss": 0.4179, "step": 845 }, { "epoch": 7.657657657657658, "grad_norm": 0.76171875, "learning_rate": 3.1604915069603436e-05, "loss": 0.429, "step": 850 }, { "epoch": 7.702702702702703, "grad_norm": 0.75, "learning_rate": 3.0466329834968233e-05, "loss": 0.4118, "step": 855 }, { "epoch": 7.747747747747748, "grad_norm": 0.71484375, "learning_rate": 2.9344935356606773e-05, "loss": 0.4049, "step": 860 }, { "epoch": 7.792792792792793, "grad_norm": 0.74609375, "learning_rate": 2.8241008876021215e-05, "loss": 0.413, "step": 865 }, { "epoch": 7.837837837837838, "grad_norm": 0.72265625, "learning_rate": 2.7154823316113932e-05, "loss": 0.4071, "step": 870 }, { "epoch": 7.882882882882883, "grad_norm": 0.734375, "learning_rate": 2.60866472137129e-05, "loss": 0.4073, "step": 875 }, { "epoch": 7.927927927927928, "grad_norm": 0.71875, "learning_rate": 2.5036744653181753e-05, "loss": 0.4124, "step": 880 }, { "epoch": 7.972972972972973, "grad_norm": 0.7578125, "learning_rate": 2.4005375201130274e-05, "loss": 0.4205, "step": 885 }, { "epoch": 8.0, "eval_loss": 2.9689488410949707, "eval_runtime": 1.0053, "eval_samples_per_second": 4.973, "eval_steps_per_second": 1.989, "step": 888 }, { "epoch": 8.018018018018019, "grad_norm": 0.68359375, "learning_rate": 2.29927938422419e-05, "loss": 0.4012, "step": 890 }, { "epoch": 8.063063063063064, "grad_norm": 1.0078125, "learning_rate": 2.199925091623418e-05, "loss": 0.3781, "step": 895 }, { "epoch": 8.108108108108109, "grad_norm": 0.8671875, "learning_rate": 2.102499205596743e-05, "loss": 0.3809, "step": 900 }, { "epoch": 8.153153153153154, "grad_norm": 0.70703125, "learning_rate": 2.0070258126717e-05, "loss": 0.3699, "step": 905 }, { "epoch": 8.198198198198199, "grad_norm": 0.6875, "learning_rate": 1.913528516662452e-05, "loss": 0.3742, "step": 910 }, { "epoch": 8.243243243243244, "grad_norm": 0.70703125, "learning_rate": 1.8220304328342252e-05, "loss": 0.378, "step": 915 }, { "epoch": 8.288288288288289, "grad_norm": 0.70703125, "learning_rate": 1.7325541821885384e-05, "loss": 0.3842, "step": 920 }, { "epoch": 8.333333333333334, "grad_norm": 0.75390625, "learning_rate": 1.6451218858706374e-05, "loss": 0.3894, "step": 925 }, { "epoch": 8.378378378378379, "grad_norm": 0.71875, "learning_rate": 1.5597551597004966e-05, "loss": 0.3758, "step": 930 }, { "epoch": 8.423423423423424, "grad_norm": 0.671875, "learning_rate": 1.476475108828762e-05, "loss": 0.3717, "step": 935 }, { "epoch": 8.468468468468469, "grad_norm": 0.703125, "learning_rate": 1.3953023225189243e-05, "loss": 0.3771, "step": 940 }, { "epoch": 8.513513513513514, "grad_norm": 0.71875, "learning_rate": 1.3162568690570743e-05, "loss": 0.3759, "step": 945 }, { "epoch": 8.558558558558559, "grad_norm": 0.74609375, "learning_rate": 1.23935829079042e-05, "loss": 0.3786, "step": 950 }, { "epoch": 8.603603603603604, "grad_norm": 0.7109375, "learning_rate": 1.1646255992958466e-05, "loss": 0.3734, "step": 955 }, { "epoch": 8.64864864864865, "grad_norm": 0.7265625, "learning_rate": 1.0920772706797167e-05, "loss": 0.3809, "step": 960 }, { "epoch": 8.693693693693694, "grad_norm": 0.7109375, "learning_rate": 1.0217312410100089e-05, "loss": 0.3767, "step": 965 }, { "epoch": 8.73873873873874, "grad_norm": 0.68359375, "learning_rate": 9.536049018820192e-06, "loss": 0.3786, "step": 970 }, { "epoch": 8.783783783783784, "grad_norm": 0.71875, "learning_rate": 8.87715096118642e-06, "loss": 0.3786, "step": 975 }, { "epoch": 8.82882882882883, "grad_norm": 0.74609375, "learning_rate": 8.240781136063346e-06, "loss": 0.3868, "step": 980 }, { "epoch": 8.873873873873874, "grad_norm": 0.72265625, "learning_rate": 7.6270968726777414e-06, "loss": 0.3767, "step": 985 }, { "epoch": 8.91891891891892, "grad_norm": 0.7578125, "learning_rate": 7.03624989172228e-06, "loss": 0.3791, "step": 990 }, { "epoch": 8.963963963963964, "grad_norm": 0.71875, "learning_rate": 6.468386267845717e-06, "loss": 0.382, "step": 995 }, { "epoch": 9.0, "eval_loss": 3.2251663208007812, "eval_runtime": 1.0069, "eval_samples_per_second": 4.966, "eval_steps_per_second": 1.986, "step": 999 }, { "epoch": 9.00900900900901, "grad_norm": 0.71484375, "learning_rate": 5.9236463935389065e-06, "loss": 0.3794, "step": 1000 }, { "epoch": 9.054054054054054, "grad_norm": 0.671875, "learning_rate": 5.402164944425758e-06, "loss": 0.3777, "step": 1005 }, { "epoch": 9.0990990990991, "grad_norm": 0.734375, "learning_rate": 4.904070845967468e-06, "loss": 0.3779, "step": 1010 }, { "epoch": 9.144144144144144, "grad_norm": 0.703125, "learning_rate": 4.429487241588304e-06, "loss": 0.3744, "step": 1015 }, { "epoch": 9.18918918918919, "grad_norm": 0.6953125, "learning_rate": 3.9785314622310495e-06, "loss": 0.3694, "step": 1020 }, { "epoch": 9.234234234234235, "grad_norm": 0.71875, "learning_rate": 3.5513149973492976e-06, "loss": 0.3751, "step": 1025 }, { "epoch": 9.27927927927928, "grad_norm": 0.7265625, "learning_rate": 3.1479434673440167e-06, "loss": 0.3685, "step": 1030 }, { "epoch": 9.324324324324325, "grad_norm": 0.68359375, "learning_rate": 2.7685165974510986e-06, "loss": 0.3653, "step": 1035 }, { "epoch": 9.36936936936937, "grad_norm": 0.71484375, "learning_rate": 2.4131281930864002e-06, "loss": 0.3728, "step": 1040 }, { "epoch": 9.414414414414415, "grad_norm": 0.734375, "learning_rate": 2.0818661166542074e-06, "loss": 0.3693, "step": 1045 }, { "epoch": 9.45945945945946, "grad_norm": 0.6875, "learning_rate": 1.7748122658251876e-06, "loss": 0.3764, "step": 1050 }, { "epoch": 9.504504504504505, "grad_norm": 0.7265625, "learning_rate": 1.4920425532888526e-06, "loss": 0.3654, "step": 1055 }, { "epoch": 9.54954954954955, "grad_norm": 0.66796875, "learning_rate": 1.2336268879856727e-06, "loss": 0.3747, "step": 1060 }, { "epoch": 9.594594594594595, "grad_norm": 0.69140625, "learning_rate": 9.996291578236228e-07, "loss": 0.3711, "step": 1065 }, { "epoch": 9.63963963963964, "grad_norm": 0.71484375, "learning_rate": 7.901072138831511e-07, "loss": 0.3722, "step": 1070 }, { "epoch": 9.684684684684685, "grad_norm": 0.7109375, "learning_rate": 6.051128561147756e-07, "loss": 0.3612, "step": 1075 }, { "epoch": 9.72972972972973, "grad_norm": 0.74609375, "learning_rate": 4.44691820532539e-07, "loss": 0.3647, "step": 1080 }, { "epoch": 9.774774774774775, "grad_norm": 0.6875, "learning_rate": 3.0888376790679795e-07, "loss": 0.3672, "step": 1085 }, { "epoch": 9.81981981981982, "grad_norm": 0.6484375, "learning_rate": 1.977222739588891e-07, "loss": 0.3659, "step": 1090 }, { "epoch": 9.864864864864865, "grad_norm": 0.67578125, "learning_rate": 1.1123482106021322e-07, "loss": 0.3692, "step": 1095 }, { "epoch": 9.90990990990991, "grad_norm": 0.6875, "learning_rate": 4.9442791437848136e-08, "loss": 0.3663, "step": 1100 }, { "epoch": 9.954954954954955, "grad_norm": 0.6875, "learning_rate": 1.2361461888166226e-08, "loss": 0.3673, "step": 1105 }, { "epoch": 10.0, "grad_norm": 0.6640625, "learning_rate": 0.0, "loss": 0.372, "step": 1110 }, { "epoch": 10.0, "eval_loss": 3.242992401123047, "eval_runtime": 1.0031, "eval_samples_per_second": 4.984, "eval_steps_per_second": 1.994, "step": 1110 }, { "epoch": 10.0, "step": 1110, "total_flos": 1.697049221804327e+18, "train_loss": 1.8630313719715084, "train_runtime": 9058.6901, "train_samples_per_second": 1.957, "train_steps_per_second": 0.123 } ], "logging_steps": 5, "max_steps": 1110, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.697049221804327e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }