{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 141420, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07071135624381275, "grad_norm": 1.5066859722137451, "learning_rate": 3.125e-05, "loss": 4.9729, "step": 1000 }, { "epoch": 0.1414227124876255, "grad_norm": 0.9705550670623779, "learning_rate": 6.25e-05, "loss": 3.374, "step": 2000 }, { "epoch": 0.21213406873143828, "grad_norm": 0.8380181789398193, "learning_rate": 9.375e-05, "loss": 3.0791, "step": 3000 }, { "epoch": 0.282845424975251, "grad_norm": 0.7704618573188782, "learning_rate": 0.000125, "loss": 2.9038, "step": 4000 }, { "epoch": 0.3535567812190638, "grad_norm": 0.675819993019104, "learning_rate": 0.00015625, "loss": 2.7896, "step": 5000 }, { "epoch": 0.42426813746287656, "grad_norm": 0.6245794892311096, "learning_rate": 0.0001875, "loss": 2.6919, "step": 6000 }, { "epoch": 0.4949794937066893, "grad_norm": 0.5709772109985352, "learning_rate": 0.00021875, "loss": 2.6272, "step": 7000 }, { "epoch": 0.565690849950502, "grad_norm": 0.5335475206375122, "learning_rate": 0.00025, "loss": 2.5715, "step": 8000 }, { "epoch": 0.6364022061943148, "grad_norm": 0.4924204349517822, "learning_rate": 0.00028125000000000003, "loss": 2.5384, "step": 9000 }, { "epoch": 0.7071135624381276, "grad_norm": 0.49077439308166504, "learning_rate": 0.0003125, "loss": 2.5154, "step": 10000 }, { "epoch": 0.7778249186819404, "grad_norm": 0.4304497241973877, "learning_rate": 0.00034371875, "loss": 2.4784, "step": 11000 }, { "epoch": 0.8485362749257531, "grad_norm": 0.45384302735328674, "learning_rate": 0.00037496875000000003, "loss": 2.458, "step": 12000 }, { "epoch": 0.9192476311695659, "grad_norm": 0.3978016674518585, "learning_rate": 0.0004061875, "loss": 2.4536, "step": 13000 }, { "epoch": 0.9899589874133786, "grad_norm": 0.3981296718120575, "learning_rate": 0.0004374375, "loss": 2.4397, "step": 14000 }, { "epoch": 1.0, "eval_accuracy": 0.49545158536152034, "eval_loss": 2.6684181690216064, "eval_runtime": 101.2526, "eval_samples_per_second": 462.882, "eval_steps_per_second": 7.239, "step": 14142 }, { "epoch": 1.0606703436571914, "grad_norm": 0.36285269260406494, "learning_rate": 0.00046865625, "loss": 2.3887, "step": 15000 }, { "epoch": 1.131381699901004, "grad_norm": 0.3568965494632721, "learning_rate": 0.00049990625, "loss": 2.3848, "step": 16000 }, { "epoch": 1.2020930561448169, "grad_norm": 0.32918983697891235, "learning_rate": 0.00053109375, "loss": 2.3763, "step": 17000 }, { "epoch": 1.2728044123886297, "grad_norm": 0.3192691206932068, "learning_rate": 0.00056234375, "loss": 2.3757, "step": 18000 }, { "epoch": 1.3435157686324424, "grad_norm": 0.29217350482940674, "learning_rate": 0.00059359375, "loss": 2.3728, "step": 19000 }, { "epoch": 1.414227124876255, "grad_norm": 0.2726396918296814, "learning_rate": 0.0006248437500000001, "loss": 2.3482, "step": 20000 }, { "epoch": 1.4849384811200679, "grad_norm": 0.2647142708301544, "learning_rate": 0.0006560625, "loss": 2.361, "step": 21000 }, { "epoch": 1.5556498373638807, "grad_norm": 0.24640022218227386, "learning_rate": 0.00068728125, "loss": 2.3414, "step": 22000 }, { "epoch": 1.6263611936076934, "grad_norm": 0.2376652956008911, "learning_rate": 0.00071853125, "loss": 2.3469, "step": 23000 }, { "epoch": 1.697072549851506, "grad_norm": 0.20667687058448792, "learning_rate": 0.00074978125, "loss": 2.3334, "step": 24000 }, { "epoch": 1.7677839060953189, "grad_norm": 0.21862906217575073, "learning_rate": 0.0007810312499999999, "loss": 2.325, "step": 25000 }, { "epoch": 1.8384952623391317, "grad_norm": 0.19700638949871063, "learning_rate": 0.00081225, "loss": 2.3169, "step": 26000 }, { "epoch": 1.9092066185829444, "grad_norm": 0.19530941545963287, "learning_rate": 0.00084346875, "loss": 2.3085, "step": 27000 }, { "epoch": 1.979917974826757, "grad_norm": 0.18496540188789368, "learning_rate": 0.00087471875, "loss": 2.3085, "step": 28000 }, { "epoch": 2.0, "eval_accuracy": 0.5092843747934841, "eval_loss": 2.5420279502868652, "eval_runtime": 102.0041, "eval_samples_per_second": 459.472, "eval_steps_per_second": 7.186, "step": 28284 }, { "epoch": 2.05062933107057, "grad_norm": 0.18070034682750702, "learning_rate": 0.00090596875, "loss": 2.2557, "step": 29000 }, { "epoch": 2.1213406873143827, "grad_norm": 0.17614798247814178, "learning_rate": 0.0009371875, "loss": 2.248, "step": 30000 }, { "epoch": 2.1920520435581956, "grad_norm": 0.18162938952445984, "learning_rate": 0.0009684375, "loss": 2.246, "step": 31000 }, { "epoch": 2.262763399802008, "grad_norm": 0.16680462658405304, "learning_rate": 0.0009996875, "loss": 2.2398, "step": 32000 }, { "epoch": 2.333474756045821, "grad_norm": 0.17343448102474213, "learning_rate": 0.0009909614330104186, "loss": 2.2332, "step": 33000 }, { "epoch": 2.4041861122896337, "grad_norm": 0.15368333458900452, "learning_rate": 0.0009818223359532078, "loss": 2.2367, "step": 34000 }, { "epoch": 2.4748974685334466, "grad_norm": 0.14444677531719208, "learning_rate": 0.0009726832388959971, "loss": 2.2277, "step": 35000 }, { "epoch": 2.5456088247772595, "grad_norm": 0.16958372294902802, "learning_rate": 0.0009635441418387864, "loss": 2.2136, "step": 36000 }, { "epoch": 2.616320181021072, "grad_norm": 0.15171754360198975, "learning_rate": 0.0009544141838786328, "loss": 2.2105, "step": 37000 }, { "epoch": 2.6870315372648848, "grad_norm": 0.13588131964206696, "learning_rate": 0.0009452750868214221, "loss": 2.2056, "step": 38000 }, { "epoch": 2.7577428935086976, "grad_norm": 0.13553854823112488, "learning_rate": 0.0009361359897642113, "loss": 2.1988, "step": 39000 }, { "epoch": 2.82845424975251, "grad_norm": 0.15744280815124512, "learning_rate": 0.0009269968927070006, "loss": 2.1949, "step": 40000 }, { "epoch": 2.899165605996323, "grad_norm": 0.1427813470363617, "learning_rate": 0.000917866934746847, "loss": 2.1875, "step": 41000 }, { "epoch": 2.9698769622401358, "grad_norm": 0.14179003238677979, "learning_rate": 0.0009087278376896363, "loss": 2.19, "step": 42000 }, { "epoch": 3.0, "eval_accuracy": 0.5214661161125094, "eval_loss": 2.439739942550659, "eval_runtime": 102.0725, "eval_samples_per_second": 459.164, "eval_steps_per_second": 7.181, "step": 42426 }, { "epoch": 3.0405883184839486, "grad_norm": 0.1600356101989746, "learning_rate": 0.0008995978797294828, "loss": 2.13, "step": 43000 }, { "epoch": 3.1112996747277615, "grad_norm": 0.16733036935329437, "learning_rate": 0.0008904587826722719, "loss": 2.0964, "step": 44000 }, { "epoch": 3.182011030971574, "grad_norm": 0.15149937570095062, "learning_rate": 0.0008813379638091756, "loss": 2.0964, "step": 45000 }, { "epoch": 3.2527223872153868, "grad_norm": 0.1375265121459961, "learning_rate": 0.0008721988667519649, "loss": 2.1021, "step": 46000 }, { "epoch": 3.3234337434591996, "grad_norm": 0.13642068207263947, "learning_rate": 0.0008630597696947542, "loss": 2.1062, "step": 47000 }, { "epoch": 3.3941450997030125, "grad_norm": 0.15942348539829254, "learning_rate": 0.0008539206726375435, "loss": 2.0943, "step": 48000 }, { "epoch": 3.464856455946825, "grad_norm": 0.14231225848197937, "learning_rate": 0.0008447815755803326, "loss": 2.0968, "step": 49000 }, { "epoch": 3.5355678121906378, "grad_norm": 0.13483628630638123, "learning_rate": 0.0008356516176201791, "loss": 2.0923, "step": 50000 }, { "epoch": 3.6062791684344506, "grad_norm": 0.15377779304981232, "learning_rate": 0.0008265125205629684, "loss": 2.0929, "step": 51000 }, { "epoch": 3.6769905246782635, "grad_norm": 0.13733841478824615, "learning_rate": 0.0008173825626028149, "loss": 2.0929, "step": 52000 }, { "epoch": 3.747701880922076, "grad_norm": 0.13640180230140686, "learning_rate": 0.0008082434655456042, "loss": 2.0938, "step": 53000 }, { "epoch": 3.8184132371658888, "grad_norm": 0.13909070193767548, "learning_rate": 0.0007991135075854505, "loss": 2.0907, "step": 54000 }, { "epoch": 3.8891245934097016, "grad_norm": 0.1521981954574585, "learning_rate": 0.0007899744105282398, "loss": 2.0816, "step": 55000 }, { "epoch": 3.9598359496535145, "grad_norm": 0.12255113571882248, "learning_rate": 0.0007808444525680864, "loss": 2.0865, "step": 56000 }, { "epoch": 4.0, "eval_accuracy": 0.5276129432475146, "eval_loss": 2.3943161964416504, "eval_runtime": 104.7687, "eval_samples_per_second": 447.347, "eval_steps_per_second": 6.996, "step": 56568 }, { "epoch": 4.030547305897327, "grad_norm": 0.1423817127943039, "learning_rate": 0.0007717053555108755, "loss": 2.0304, "step": 57000 }, { "epoch": 4.10125866214114, "grad_norm": 0.13736553490161896, "learning_rate": 0.0007625662584536648, "loss": 1.9815, "step": 58000 }, { "epoch": 4.171970018384952, "grad_norm": 0.1411396712064743, "learning_rate": 0.0007534363004935113, "loss": 1.9919, "step": 59000 }, { "epoch": 4.2426813746287655, "grad_norm": 0.14484618604183197, "learning_rate": 0.0007442972034363005, "loss": 1.9915, "step": 60000 }, { "epoch": 4.313392730872578, "grad_norm": 0.1606305092573166, "learning_rate": 0.000735167245476147, "loss": 1.9925, "step": 61000 }, { "epoch": 4.384104087116391, "grad_norm": 0.15816234052181244, "learning_rate": 0.0007260281484189363, "loss": 1.9963, "step": 62000 }, { "epoch": 4.454815443360204, "grad_norm": 0.14397823810577393, "learning_rate": 0.0007168981904587826, "loss": 1.9989, "step": 63000 }, { "epoch": 4.525526799604016, "grad_norm": 0.15473702549934387, "learning_rate": 0.0007077590934015719, "loss": 1.9965, "step": 64000 }, { "epoch": 4.596238155847829, "grad_norm": 0.14191265404224396, "learning_rate": 0.0006986291354414184, "loss": 2.0005, "step": 65000 }, { "epoch": 4.666949512091642, "grad_norm": 0.15206751227378845, "learning_rate": 0.0006894900383842077, "loss": 2.0114, "step": 66000 }, { "epoch": 4.737660868335455, "grad_norm": 0.18548937141895294, "learning_rate": 0.0006803600804240542, "loss": 2.0021, "step": 67000 }, { "epoch": 4.8083722245792675, "grad_norm": 0.16364724934101105, "learning_rate": 0.0006712209833668433, "loss": 2.0093, "step": 68000 }, { "epoch": 4.87908358082308, "grad_norm": 0.1373205929994583, "learning_rate": 0.0006620818863096326, "loss": 2.0073, "step": 69000 }, { "epoch": 4.949794937066893, "grad_norm": 0.15305304527282715, "learning_rate": 0.000652951928349479, "loss": 1.9957, "step": 70000 }, { "epoch": 5.0, "eval_accuracy": 0.5305311481637808, "eval_loss": 2.3786160945892334, "eval_runtime": 102.0738, "eval_samples_per_second": 459.158, "eval_steps_per_second": 7.181, "step": 70710 }, { "epoch": 5.020506293310706, "grad_norm": 0.17954622209072113, "learning_rate": 0.0006438128312922683, "loss": 1.963, "step": 71000 }, { "epoch": 5.091217649554518, "grad_norm": 0.17249706387519836, "learning_rate": 0.0006346828733321149, "loss": 1.8814, "step": 72000 }, { "epoch": 5.161929005798331, "grad_norm": 0.16035763919353485, "learning_rate": 0.000625543776274904, "loss": 1.8888, "step": 73000 }, { "epoch": 5.232640362042144, "grad_norm": 0.16601450741291046, "learning_rate": 0.0006164046792176932, "loss": 1.8945, "step": 74000 }, { "epoch": 5.303351718285957, "grad_norm": 0.1559607982635498, "learning_rate": 0.0006072747212575398, "loss": 1.9005, "step": 75000 }, { "epoch": 5.3740630745297695, "grad_norm": 0.1599714308977127, "learning_rate": 0.000598135624200329, "loss": 1.9056, "step": 76000 }, { "epoch": 5.444774430773582, "grad_norm": 0.15538254380226135, "learning_rate": 0.0005890056662401755, "loss": 1.9091, "step": 77000 }, { "epoch": 5.515485787017395, "grad_norm": 0.1645193099975586, "learning_rate": 0.0005798665691829647, "loss": 1.9138, "step": 78000 }, { "epoch": 5.586197143261208, "grad_norm": 0.1560288369655609, "learning_rate": 0.0005707366112228112, "loss": 1.9276, "step": 79000 }, { "epoch": 5.65690849950502, "grad_norm": 0.169467955827713, "learning_rate": 0.0005615975141656004, "loss": 1.9167, "step": 80000 }, { "epoch": 5.727619855748833, "grad_norm": 0.18090558052062988, "learning_rate": 0.0005524675562054469, "loss": 1.9289, "step": 81000 }, { "epoch": 5.798331211992646, "grad_norm": 0.16788819432258606, "learning_rate": 0.0005433284591482362, "loss": 1.9228, "step": 82000 }, { "epoch": 5.869042568236459, "grad_norm": 0.15961690247058868, "learning_rate": 0.0005341893620910255, "loss": 1.9178, "step": 83000 }, { "epoch": 5.9397539244802715, "grad_norm": 0.15657977759838104, "learning_rate": 0.0005250594041308718, "loss": 1.9161, "step": 84000 }, { "epoch": 6.0, "eval_accuracy": 0.5312578351518911, "eval_loss": 2.3910350799560547, "eval_runtime": 102.0407, "eval_samples_per_second": 459.307, "eval_steps_per_second": 7.183, "step": 84852 }, { "epoch": 6.010465280724084, "grad_norm": 0.15551112592220306, "learning_rate": 0.0005159203070736611, "loss": 1.9123, "step": 85000 }, { "epoch": 6.081176636967897, "grad_norm": 0.18589554727077484, "learning_rate": 0.0005067812100164504, "loss": 1.7906, "step": 86000 }, { "epoch": 6.15188799321171, "grad_norm": 0.16240116953849792, "learning_rate": 0.0004976512520562968, "loss": 1.805, "step": 87000 }, { "epoch": 6.222599349455523, "grad_norm": 0.1752467155456543, "learning_rate": 0.0004885121549990861, "loss": 1.8147, "step": 88000 }, { "epoch": 6.293310705699335, "grad_norm": 0.15973269939422607, "learning_rate": 0.00047937305794187537, "loss": 1.8063, "step": 89000 }, { "epoch": 6.364022061943148, "grad_norm": 0.18358197808265686, "learning_rate": 0.0004702430999817218, "loss": 1.8182, "step": 90000 }, { "epoch": 6.434733418186961, "grad_norm": 0.20550867915153503, "learning_rate": 0.00046110400292451105, "loss": 1.8251, "step": 91000 }, { "epoch": 6.5054447744307735, "grad_norm": 0.18148034811019897, "learning_rate": 0.0004519740449643575, "loss": 1.8283, "step": 92000 }, { "epoch": 6.576156130674587, "grad_norm": 0.1863207072019577, "learning_rate": 0.0004428349479071468, "loss": 1.834, "step": 93000 }, { "epoch": 6.646867486918399, "grad_norm": 0.1836949586868286, "learning_rate": 0.000433695850849936, "loss": 1.8257, "step": 94000 }, { "epoch": 6.717578843162212, "grad_norm": 0.18851223587989807, "learning_rate": 0.00042456589288978247, "loss": 1.8291, "step": 95000 }, { "epoch": 6.788290199406025, "grad_norm": 0.16575908660888672, "learning_rate": 0.00041542679583257176, "loss": 1.8412, "step": 96000 }, { "epoch": 6.859001555649837, "grad_norm": 0.1861979216337204, "learning_rate": 0.000406287698775361, "loss": 1.848, "step": 97000 }, { "epoch": 6.92971291189365, "grad_norm": 0.1783532202243805, "learning_rate": 0.00039714860171815024, "loss": 1.8361, "step": 98000 }, { "epoch": 7.0, "eval_accuracy": 0.5303533991815411, "eval_loss": 2.4205334186553955, "eval_runtime": 102.4141, "eval_samples_per_second": 457.632, "eval_steps_per_second": 7.157, "step": 98994 }, { "epoch": 7.000424268137463, "grad_norm": 0.1907605677843094, "learning_rate": 0.00038801864375799674, "loss": 1.8413, "step": 99000 }, { "epoch": 7.0711356243812755, "grad_norm": 0.21442489326000214, "learning_rate": 0.0003788795467007859, "loss": 1.6956, "step": 100000 }, { "epoch": 7.141846980625088, "grad_norm": 0.19562986493110657, "learning_rate": 0.0003697404496435752, "loss": 1.7053, "step": 101000 }, { "epoch": 7.212558336868901, "grad_norm": 0.23670311272144318, "learning_rate": 0.00036060135258636445, "loss": 1.7196, "step": 102000 }, { "epoch": 7.283269693112714, "grad_norm": 0.19641369581222534, "learning_rate": 0.00035148053372326815, "loss": 1.719, "step": 103000 }, { "epoch": 7.353981049356527, "grad_norm": 0.2086309790611267, "learning_rate": 0.0003423414366660574, "loss": 1.7279, "step": 104000 }, { "epoch": 7.424692405600339, "grad_norm": 0.1947568953037262, "learning_rate": 0.0003332023396088467, "loss": 1.7389, "step": 105000 }, { "epoch": 7.495403761844152, "grad_norm": 0.19536983966827393, "learning_rate": 0.00032407238164869313, "loss": 1.7428, "step": 106000 }, { "epoch": 7.566115118087965, "grad_norm": 0.1872589886188507, "learning_rate": 0.00031493328459148237, "loss": 1.7463, "step": 107000 }, { "epoch": 7.6368264743317775, "grad_norm": 0.22906361520290375, "learning_rate": 0.0003057941875342716, "loss": 1.7479, "step": 108000 }, { "epoch": 7.707537830575591, "grad_norm": 0.19299902021884918, "learning_rate": 0.0002966642295741181, "loss": 1.7514, "step": 109000 }, { "epoch": 7.778249186819403, "grad_norm": 0.19876809418201447, "learning_rate": 0.00028752513251690734, "loss": 1.7467, "step": 110000 }, { "epoch": 7.848960543063216, "grad_norm": 0.22273430228233337, "learning_rate": 0.0002783860354596966, "loss": 1.76, "step": 111000 }, { "epoch": 7.919671899307029, "grad_norm": 0.1979241669178009, "learning_rate": 0.0002692560774995431, "loss": 1.7547, "step": 112000 }, { "epoch": 7.990383255550841, "grad_norm": 0.2099294811487198, "learning_rate": 0.00026011698044233226, "loss": 1.7477, "step": 113000 }, { "epoch": 8.0, "eval_accuracy": 0.5282502161049046, "eval_loss": 2.474827289581299, "eval_runtime": 102.4954, "eval_samples_per_second": 457.269, "eval_steps_per_second": 7.152, "step": 113136 }, { "epoch": 8.061094611794655, "grad_norm": 0.24672599136829376, "learning_rate": 0.00025097788338512156, "loss": 1.6197, "step": 114000 }, { "epoch": 8.131805968038467, "grad_norm": 0.21202607452869415, "learning_rate": 0.00024183878632791082, "loss": 1.6192, "step": 115000 }, { "epoch": 8.20251732428228, "grad_norm": 0.24981403350830078, "learning_rate": 0.00023271796746481447, "loss": 1.6329, "step": 116000 }, { "epoch": 8.273228680526092, "grad_norm": 0.25290995836257935, "learning_rate": 0.00022357887040760373, "loss": 1.6386, "step": 117000 }, { "epoch": 8.343940036769904, "grad_norm": 0.2473640739917755, "learning_rate": 0.000214439773350393, "loss": 1.6414, "step": 118000 }, { "epoch": 8.414651393013719, "grad_norm": 0.20307676494121552, "learning_rate": 0.00020530981539023944, "loss": 1.6458, "step": 119000 }, { "epoch": 8.485362749257531, "grad_norm": 0.21696613729000092, "learning_rate": 0.0001961707183330287, "loss": 1.6473, "step": 120000 }, { "epoch": 8.556074105501343, "grad_norm": 0.23408186435699463, "learning_rate": 0.00018703162127581797, "loss": 1.656, "step": 121000 }, { "epoch": 8.626785461745156, "grad_norm": 0.23058977723121643, "learning_rate": 0.0001778925242186072, "loss": 1.6578, "step": 122000 }, { "epoch": 8.697496817988968, "grad_norm": 0.23317036032676697, "learning_rate": 0.00016877170535551086, "loss": 1.6516, "step": 123000 }, { "epoch": 8.768208174232782, "grad_norm": 0.2361781746149063, "learning_rate": 0.00015963260829830012, "loss": 1.6525, "step": 124000 }, { "epoch": 8.838919530476595, "grad_norm": 0.260776549577713, "learning_rate": 0.00015049351124108936, "loss": 1.6547, "step": 125000 }, { "epoch": 8.909630886720407, "grad_norm": 0.2507932186126709, "learning_rate": 0.00014136355328093583, "loss": 1.6556, "step": 126000 }, { "epoch": 8.98034224296422, "grad_norm": 0.2422228902578354, "learning_rate": 0.0001322244562237251, "loss": 1.6549, "step": 127000 }, { "epoch": 9.0, "eval_accuracy": 0.5249380742803117, "eval_loss": 2.5581541061401367, "eval_runtime": 102.2383, "eval_samples_per_second": 458.419, "eval_steps_per_second": 7.17, "step": 127278 }, { "epoch": 9.051053599208032, "grad_norm": 0.2604562044143677, "learning_rate": 0.00012308535916651437, "loss": 1.5675, "step": 128000 }, { "epoch": 9.121764955451846, "grad_norm": 0.22102615237236023, "learning_rate": 0.0001139462621093036, "loss": 1.5337, "step": 129000 }, { "epoch": 9.192476311695659, "grad_norm": 0.2960878014564514, "learning_rate": 0.00010481630414915007, "loss": 1.5556, "step": 130000 }, { "epoch": 9.263187667939471, "grad_norm": 0.22400617599487305, "learning_rate": 9.567720709193931e-05, "loss": 1.5491, "step": 131000 }, { "epoch": 9.333899024183284, "grad_norm": 0.24257275462150574, "learning_rate": 8.655638822884298e-05, "loss": 1.5502, "step": 132000 }, { "epoch": 9.404610380427096, "grad_norm": 0.24599485099315643, "learning_rate": 7.741729117163225e-05, "loss": 1.552, "step": 133000 }, { "epoch": 9.47532173667091, "grad_norm": 0.25757452845573425, "learning_rate": 6.82781941144215e-05, "loss": 1.5576, "step": 134000 }, { "epoch": 9.546033092914723, "grad_norm": 0.28276532888412476, "learning_rate": 5.914823615426796e-05, "loss": 1.5529, "step": 135000 }, { "epoch": 9.616744449158535, "grad_norm": 0.2369563728570938, "learning_rate": 5.000913909705721e-05, "loss": 1.5548, "step": 136000 }, { "epoch": 9.687455805402347, "grad_norm": 0.25778231024742126, "learning_rate": 4.0870042039846464e-05, "loss": 1.55, "step": 137000 }, { "epoch": 9.75816716164616, "grad_norm": 0.2770988941192627, "learning_rate": 3.173094498263571e-05, "loss": 1.559, "step": 138000 }, { "epoch": 9.828878517889972, "grad_norm": 0.2701665163040161, "learning_rate": 2.261012611953939e-05, "loss": 1.5579, "step": 139000 }, { "epoch": 9.899589874133786, "grad_norm": 0.2540683448314667, "learning_rate": 1.3471029062328641e-05, "loss": 1.5563, "step": 140000 }, { "epoch": 9.970301230377599, "grad_norm": 0.26811909675598145, "learning_rate": 4.341071102175106e-06, "loss": 1.5611, "step": 141000 }, { "epoch": 10.0, "eval_accuracy": 0.5203706477236009, "eval_loss": 2.6626083850860596, "eval_runtime": 102.3167, "eval_samples_per_second": 458.068, "eval_steps_per_second": 7.164, "step": 141420 }, { "epoch": 10.0, "step": 141420, "total_flos": 6.171008476428288e+17, "train_loss": 2.0168114449748256, "train_runtime": 24063.5613, "train_samples_per_second": 188.054, "train_steps_per_second": 5.877 } ], "logging_steps": 1000, "max_steps": 141420, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.171008476428288e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }