{
  "best_metric": 0.5838193893432617,
  "best_model_checkpoint": "./0.4b_finetuned_results/checkpoint-2500",
  "epoch": 3.7425149700598803,
  "eval_steps": 500,
  "global_step": 2500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014970059880239521,
      "grad_norm": 4.375,
      "learning_rate": 6.666666666666667e-05,
      "loss": 3.8882,
      "step": 10
    },
    {
      "epoch": 0.029940119760479042,
      "grad_norm": 6.0625,
      "learning_rate": 0.00013333333333333334,
      "loss": 3.2257,
      "step": 20
    },
    {
      "epoch": 0.04491017964071856,
      "grad_norm": 6.28125,
      "learning_rate": 0.0002,
      "loss": 2.92,
      "step": 30
    },
    {
      "epoch": 0.059880239520958084,
      "grad_norm": 1.9765625,
      "learning_rate": 0.00019999938668382333,
      "loss": 2.3984,
      "step": 40
    },
    {
      "epoch": 0.0748502994011976,
      "grad_norm": 1.484375,
      "learning_rate": 0.00019999754674281632,
      "loss": 2.1626,
      "step": 50
    },
    {
      "epoch": 0.08982035928143713,
      "grad_norm": 3.375,
      "learning_rate": 0.0001999944801995484,
      "loss": 2.0388,
      "step": 60
    },
    {
      "epoch": 0.10479041916167664,
      "grad_norm": 1.7890625,
      "learning_rate": 0.0001999901870916347,
      "loss": 2.0121,
      "step": 70
    },
    {
      "epoch": 0.11976047904191617,
      "grad_norm": 1.609375,
      "learning_rate": 0.00019998466747173592,
      "loss": 1.8579,
      "step": 80
    },
    {
      "epoch": 0.1347305389221557,
      "grad_norm": 0.81640625,
      "learning_rate": 0.00019997792140755746,
      "loss": 1.8254,
      "step": 90
    },
    {
      "epoch": 0.1497005988023952,
      "grad_norm": 1.515625,
      "learning_rate": 0.0001999699489818488,
      "loss": 1.7037,
      "step": 100
    },
    {
      "epoch": 0.16467065868263472,
      "grad_norm": 0.94140625,
      "learning_rate": 0.00019996075029240219,
      "loss": 1.6647,
      "step": 110
    },
    {
      "epoch": 0.17964071856287425,
      "grad_norm": 0.61328125,
      "learning_rate": 0.0001999503254520518,
      "loss": 1.5988,
      "step": 120
    },
    {
      "epoch": 0.19461077844311378,
      "grad_norm": 0.337890625,
      "learning_rate": 0.00019993867458867207,
      "loss": 1.6197,
      "step": 130
    },
    {
      "epoch": 0.20958083832335328,
      "grad_norm": 0.47265625,
      "learning_rate": 0.00019992579784517626,
      "loss": 1.5954,
      "step": 140
    },
    {
      "epoch": 0.2245508982035928,
      "grad_norm": 0.33203125,
      "learning_rate": 0.00019991169537951468,
      "loss": 1.5666,
      "step": 150
    },
    {
      "epoch": 0.23952095808383234,
      "grad_norm": 0.52734375,
      "learning_rate": 0.00019989636736467278,
      "loss": 1.5227,
      "step": 160
    },
    {
      "epoch": 0.25449101796407186,
      "grad_norm": 0.34375,
      "learning_rate": 0.00019987981398866887,
      "loss": 1.5048,
      "step": 170
    },
    {
      "epoch": 0.2694610778443114,
      "grad_norm": 0.46875,
      "learning_rate": 0.00019986203545455203,
      "loss": 1.4755,
      "step": 180
    },
    {
      "epoch": 0.2844311377245509,
      "grad_norm": 0.51953125,
      "learning_rate": 0.0001998430319803996,
      "loss": 1.4505,
      "step": 190
    },
    {
      "epoch": 0.2994011976047904,
      "grad_norm": 0.38671875,
      "learning_rate": 0.00019982280379931422,
      "loss": 1.4295,
      "step": 200
    },
    {
      "epoch": 0.3143712574850299,
      "grad_norm": 0.34765625,
      "learning_rate": 0.00019980135115942136,
      "loss": 1.4683,
      "step": 210
    },
    {
      "epoch": 0.32934131736526945,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00019977867432386604,
      "loss": 1.4427,
      "step": 220
    },
    {
      "epoch": 0.344311377245509,
      "grad_norm": 0.357421875,
      "learning_rate": 0.00019975477357080966,
      "loss": 1.3852,
      "step": 230
    },
    {
      "epoch": 0.3592814371257485,
      "grad_norm": 0.361328125,
      "learning_rate": 0.00019972964919342663,
      "loss": 1.427,
      "step": 240
    },
    {
      "epoch": 0.37425149700598803,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00019970330149990062,
      "loss": 1.3759,
      "step": 250
    },
    {
      "epoch": 0.38922155688622756,
      "grad_norm": 0.3515625,
      "learning_rate": 0.00019967573081342103,
      "loss": 1.3559,
      "step": 260
    },
    {
      "epoch": 0.4041916167664671,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00019964693747217874,
      "loss": 1.3715,
      "step": 270
    },
    {
      "epoch": 0.41916167664670656,
      "grad_norm": 0.30859375,
      "learning_rate": 0.00019961692182936225,
      "loss": 1.2932,
      "step": 280
    },
    {
      "epoch": 0.4341317365269461,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00019958568425315314,
      "loss": 1.3086,
      "step": 290
    },
    {
      "epoch": 0.4491017964071856,
      "grad_norm": 0.291015625,
      "learning_rate": 0.00019955322512672162,
      "loss": 1.3091,
      "step": 300
    },
    {
      "epoch": 0.46407185628742514,
      "grad_norm": 0.248046875,
      "learning_rate": 0.00019951954484822182,
      "loss": 1.3196,
      "step": 310
    },
    {
      "epoch": 0.47904191616766467,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00019948464383078696,
      "loss": 1.2944,
      "step": 320
    },
    {
      "epoch": 0.4940119760479042,
      "grad_norm": 0.375,
      "learning_rate": 0.00019944852250252418,
      "loss": 1.3461,
      "step": 330
    },
    {
      "epoch": 0.5089820359281437,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00019941118130650942,
      "loss": 1.3221,
      "step": 340
    },
    {
      "epoch": 0.5239520958083832,
      "grad_norm": 0.23828125,
      "learning_rate": 0.00019937262070078183,
      "loss": 1.3111,
      "step": 350
    },
    {
      "epoch": 0.5389221556886228,
      "grad_norm": 0.2578125,
      "learning_rate": 0.0001993328411583383,
      "loss": 1.3128,
      "step": 360
    },
    {
      "epoch": 0.5538922155688623,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00019929184316712758,
      "loss": 1.2618,
      "step": 370
    },
    {
      "epoch": 0.5688622754491018,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00019924962723004425,
      "loss": 1.2893,
      "step": 380
    },
    {
      "epoch": 0.5838323353293413,
      "grad_norm": 0.30859375,
      "learning_rate": 0.0001992061938649227,
      "loss": 1.2727,
      "step": 390
    },
    {
      "epoch": 0.5988023952095808,
      "grad_norm": 0.3359375,
      "learning_rate": 0.0001991615436045306,
      "loss": 1.293,
      "step": 400
    },
    {
      "epoch": 0.6137724550898204,
      "grad_norm": 0.314453125,
      "learning_rate": 0.0001991156769965625,
      "loss": 1.2692,
      "step": 410
    },
    {
      "epoch": 0.6287425149700598,
      "grad_norm": 0.326171875,
      "learning_rate": 0.00019906859460363307,
      "loss": 1.2588,
      "step": 420
    },
    {
      "epoch": 0.6437125748502994,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00019902029700327018,
      "loss": 1.2576,
      "step": 430
    },
    {
      "epoch": 0.6586826347305389,
      "grad_norm": 0.2890625,
      "learning_rate": 0.0001989707847879078,
      "loss": 1.2595,
      "step": 440
    },
    {
      "epoch": 0.6736526946107785,
      "grad_norm": 0.337890625,
      "learning_rate": 0.00019892005856487878,
      "loss": 1.2331,
      "step": 450
    },
    {
      "epoch": 0.688622754491018,
      "grad_norm": 0.28515625,
      "learning_rate": 0.0001988681189564074,
      "loss": 1.2161,
      "step": 460
    },
    {
      "epoch": 0.7035928143712575,
      "grad_norm": 0.25390625,
      "learning_rate": 0.0001988149665996017,
      "loss": 1.2675,
      "step": 470
    },
    {
      "epoch": 0.718562874251497,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00019876060214644566,
      "loss": 1.269,
      "step": 480
    },
    {
      "epoch": 0.7335329341317365,
      "grad_norm": 0.40625,
      "learning_rate": 0.00019870502626379127,
      "loss": 1.2342,
      "step": 490
    },
    {
      "epoch": 0.7485029940119761,
      "grad_norm": 0.298828125,
      "learning_rate": 0.00019864823963335033,
      "loss": 1.2351,
      "step": 500
    },
    {
      "epoch": 0.7485029940119761,
      "eval_loss": 1.1021808385849,
      "eval_runtime": 109.4058,
      "eval_samples_per_second": 9.14,
      "eval_steps_per_second": 1.143,
      "step": 500
    },
    {
      "epoch": 0.7634730538922155,
      "grad_norm": 0.3671875,
      "learning_rate": 0.00019859024295168593,
      "loss": 1.2235,
      "step": 510
    },
    {
      "epoch": 0.7784431137724551,
      "grad_norm": 0.267578125,
      "learning_rate": 0.0001985310369302042,
      "loss": 1.2353,
      "step": 520
    },
    {
      "epoch": 0.7934131736526946,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00019847062229514533,
      "loss": 1.2445,
      "step": 530
    },
    {
      "epoch": 0.8083832335329342,
      "grad_norm": 0.265625,
      "learning_rate": 0.00019840899978757485,
      "loss": 1.2687,
      "step": 540
    },
    {
      "epoch": 0.8233532934131736,
      "grad_norm": 0.2138671875,
      "learning_rate": 0.0001983461701633742,
      "loss": 1.2026,
      "step": 550
    },
    {
      "epoch": 0.8383233532934131,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00019828213419323208,
      "loss": 1.2304,
      "step": 560
    },
    {
      "epoch": 0.8532934131736527,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00019821689266263427,
      "loss": 1.1961,
      "step": 570
    },
    {
      "epoch": 0.8682634730538922,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00019815044637185456,
      "loss": 1.158,
      "step": 580
    },
    {
      "epoch": 0.8832335329341318,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00019808279613594464,
      "loss": 1.1804,
      "step": 590
    },
    {
      "epoch": 0.8982035928143712,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00019801394278472418,
      "loss": 1.1705,
      "step": 600
    },
    {
      "epoch": 0.9131736526946108,
      "grad_norm": 0.259765625,
      "learning_rate": 0.0001979438871627707,
      "loss": 1.1816,
      "step": 610
    },
    {
      "epoch": 0.9281437125748503,
      "grad_norm": 0.3515625,
      "learning_rate": 0.00019787263012940905,
      "loss": 1.2516,
      "step": 620
    },
    {
      "epoch": 0.9431137724550899,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00019780017255870114,
      "loss": 1.2214,
      "step": 630
    },
    {
      "epoch": 0.9580838323353293,
      "grad_norm": 0.27734375,
      "learning_rate": 0.00019772651533943493,
      "loss": 1.1855,
      "step": 640
    },
    {
      "epoch": 0.9730538922155688,
      "grad_norm": 0.345703125,
      "learning_rate": 0.0001976516593751137,
      "loss": 1.1784,
      "step": 650
    },
    {
      "epoch": 0.9880239520958084,
      "grad_norm": 0.32421875,
      "learning_rate": 0.00019757560558394493,
      "loss": 1.194,
      "step": 660
    },
    {
      "epoch": 1.0029940119760479,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00019749835489882905,
      "loss": 1.198,
      "step": 670
    },
    {
      "epoch": 1.0179640718562875,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00019741990826734794,
      "loss": 1.0274,
      "step": 680
    },
    {
      "epoch": 1.032934131736527,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00019734026665175334,
      "loss": 0.9878,
      "step": 690
    },
    {
      "epoch": 1.0479041916167664,
      "grad_norm": 0.32421875,
      "learning_rate": 0.0001972594310289551,
      "loss": 1.0292,
      "step": 700
    },
    {
      "epoch": 1.062874251497006,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00019717740239050914,
      "loss": 1.0265,
      "step": 710
    },
    {
      "epoch": 1.0778443113772456,
      "grad_norm": 0.314453125,
      "learning_rate": 0.0001970941817426052,
      "loss": 0.9696,
      "step": 720
    },
    {
      "epoch": 1.092814371257485,
      "grad_norm": 0.4453125,
      "learning_rate": 0.0001970097701060548,
      "loss": 0.9735,
      "step": 730
    },
    {
      "epoch": 1.1077844311377245,
      "grad_norm": 0.33984375,
      "learning_rate": 0.00019692416851627826,
      "loss": 1.0029,
      "step": 740
    },
    {
      "epoch": 1.122754491017964,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00019683737802329244,
      "loss": 1.0072,
      "step": 750
    },
    {
      "epoch": 1.1377245508982037,
      "grad_norm": 0.271484375,
      "learning_rate": 0.0001967493996916976,
      "loss": 1.0173,
      "step": 760
    },
    {
      "epoch": 1.152694610778443,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00019666023460066442,
      "loss": 0.9945,
      "step": 770
    },
    {
      "epoch": 1.1676646706586826,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00019656988384392075,
      "loss": 0.9927,
      "step": 780
    },
    {
      "epoch": 1.1826347305389222,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00019647834852973818,
      "loss": 0.9995,
      "step": 790
    },
    {
      "epoch": 1.1976047904191618,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00019638562978091853,
      "loss": 0.9957,
      "step": 800
    },
    {
      "epoch": 1.2125748502994012,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.00019629172873477995,
      "loss": 0.9653,
      "step": 810
    },
    {
      "epoch": 1.2275449101796407,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.00019619664654314302,
      "loss": 0.9714,
      "step": 820
    },
    {
      "epoch": 1.2425149700598803,
      "grad_norm": 0.302734375,
      "learning_rate": 0.0001961003843723167,
      "loss": 1.0226,
      "step": 830
    },
    {
      "epoch": 1.2574850299401197,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00019600294340308398,
      "loss": 1.0417,
      "step": 840
    },
    {
      "epoch": 1.2724550898203593,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00019590432483068722,
      "loss": 0.9593,
      "step": 850
    },
    {
      "epoch": 1.2874251497005988,
      "grad_norm": 0.27734375,
      "learning_rate": 0.00019580452986481378,
      "loss": 1.0255,
      "step": 860
    },
    {
      "epoch": 1.3023952095808382,
      "grad_norm": 0.341796875,
      "learning_rate": 0.00019570355972958097,
      "loss": 0.9971,
      "step": 870
    },
    {
      "epoch": 1.3173652694610778,
      "grad_norm": 0.322265625,
      "learning_rate": 0.00019560141566352115,
      "loss": 0.9914,
      "step": 880
    },
    {
      "epoch": 1.3323353293413174,
      "grad_norm": 0.2255859375,
      "learning_rate": 0.0001954980989195665,
      "loss": 0.9699,
      "step": 890
    },
    {
      "epoch": 1.347305389221557,
      "grad_norm": 0.25390625,
      "learning_rate": 0.0001953936107650336,
      "loss": 0.9667,
      "step": 900
    },
    {
      "epoch": 1.3622754491017965,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00019528795248160795,
      "loss": 0.9813,
      "step": 910
    },
    {
      "epoch": 1.377245508982036,
      "grad_norm": 0.23828125,
      "learning_rate": 0.0001951811253653283,
      "loss": 0.9861,
      "step": 920
    },
    {
      "epoch": 1.3922155688622755,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00019507313072657055,
      "loss": 0.9772,
      "step": 930
    },
    {
      "epoch": 1.407185628742515,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00019496396989003193,
      "loss": 1.0045,
      "step": 940
    },
    {
      "epoch": 1.4221556886227544,
      "grad_norm": 0.302734375,
      "learning_rate": 0.00019485364419471454,
      "loss": 0.9919,
      "step": 950
    },
    {
      "epoch": 1.437125748502994,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00019474215499390912,
      "loss": 0.9796,
      "step": 960
    },
    {
      "epoch": 1.4520958083832336,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00019462950365517817,
      "loss": 0.9821,
      "step": 970
    },
    {
      "epoch": 1.467065868263473,
      "grad_norm": 0.3203125,
      "learning_rate": 0.00019451569156033954,
      "loss": 1.0337,
      "step": 980
    },
    {
      "epoch": 1.4820359281437125,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00019440072010544918,
      "loss": 0.9987,
      "step": 990
    },
    {
      "epoch": 1.4970059880239521,
      "grad_norm": 0.27734375,
      "learning_rate": 0.00019428459070078416,
      "loss": 1.004,
      "step": 1000
    },
    {
      "epoch": 1.4970059880239521,
      "eval_loss": 0.9072233438491821,
      "eval_runtime": 109.5318,
      "eval_samples_per_second": 9.13,
      "eval_steps_per_second": 1.141,
      "step": 1000
    },
    {
      "epoch": 1.5119760479041915,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00019416730477082533,
      "loss": 0.9444,
      "step": 1010
    },
    {
      "epoch": 1.5269461077844313,
      "grad_norm": 0.296875,
      "learning_rate": 0.00019404886375423984,
      "loss": 0.9829,
      "step": 1020
    },
    {
      "epoch": 1.5419161676646707,
      "grad_norm": 0.27734375,
      "learning_rate": 0.00019392926910386353,
      "loss": 0.9532,
      "step": 1030
    },
    {
      "epoch": 1.55688622754491,
      "grad_norm": 0.265625,
      "learning_rate": 0.00019380852228668304,
      "loss": 0.9769,
      "step": 1040
    },
    {
      "epoch": 1.5718562874251498,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00019368662478381799,
      "loss": 0.9783,
      "step": 1050
    },
    {
      "epoch": 1.5868263473053892,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00019356357809050247,
      "loss": 0.9881,
      "step": 1060
    },
    {
      "epoch": 1.6017964071856288,
      "grad_norm": 0.232421875,
      "learning_rate": 0.00019343938371606712,
      "loss": 0.9883,
      "step": 1070
    },
    {
      "epoch": 1.6167664670658684,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00019331404318392027,
      "loss": 0.9893,
      "step": 1080
    },
    {
      "epoch": 1.6317365269461077,
      "grad_norm": 0.248046875,
      "learning_rate": 0.00019318755803152945,
      "loss": 0.9851,
      "step": 1090
    },
    {
      "epoch": 1.6467065868263473,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00019305992981040246,
      "loss": 0.9531,
      "step": 1100
    },
    {
      "epoch": 1.6616766467065869,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00019293116008606837,
      "loss": 0.9717,
      "step": 1110
    },
    {
      "epoch": 1.6766467065868262,
      "grad_norm": 0.232421875,
      "learning_rate": 0.00019280125043805824,
      "loss": 0.9699,
      "step": 1120
    },
    {
      "epoch": 1.6916167664670658,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.00019267020245988592,
      "loss": 0.9407,
      "step": 1130
    },
    {
      "epoch": 1.7065868263473054,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00019253801775902824,
      "loss": 0.977,
      "step": 1140
    },
    {
      "epoch": 1.7215568862275448,
      "grad_norm": 0.287109375,
      "learning_rate": 0.0001924046979569055,
      "loss": 0.9549,
      "step": 1150
    },
    {
      "epoch": 1.7365269461077846,
      "grad_norm": 0.248046875,
      "learning_rate": 0.00019227024468886157,
      "loss": 0.9824,
      "step": 1160
    },
    {
      "epoch": 1.751497005988024,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00019213465960414368,
      "loss": 0.9936,
      "step": 1170
    },
    {
      "epoch": 1.7664670658682635,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00019199794436588243,
      "loss": 1.0042,
      "step": 1180
    },
    {
      "epoch": 1.781437125748503,
      "grad_norm": 0.23828125,
      "learning_rate": 0.0001918601006510711,
      "loss": 0.9629,
      "step": 1190
    },
    {
      "epoch": 1.7964071856287425,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00019172113015054532,
      "loss": 0.9522,
      "step": 1200
    },
    {
      "epoch": 1.811377245508982,
      "grad_norm": 0.26953125,
      "learning_rate": 0.0001915810345689622,
      "loss": 0.9806,
      "step": 1210
    },
    {
      "epoch": 1.8263473053892216,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00019143981562477947,
      "loss": 0.9736,
      "step": 1220
    },
    {
      "epoch": 1.841317365269461,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00019129747505023436,
      "loss": 0.9701,
      "step": 1230
    },
    {
      "epoch": 1.8562874251497006,
      "grad_norm": 0.22265625,
      "learning_rate": 0.00019115401459132247,
      "loss": 0.9494,
      "step": 1240
    },
    {
      "epoch": 1.8712574850299402,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00019100943600777615,
      "loss": 0.9922,
      "step": 1250
    },
    {
      "epoch": 1.8862275449101795,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.00019086374107304312,
      "loss": 0.9711,
      "step": 1260
    },
    {
      "epoch": 1.9011976047904193,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00019071693157426457,
      "loss": 0.9664,
      "step": 1270
    },
    {
      "epoch": 1.9161676646706587,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00019056900931225333,
      "loss": 0.9591,
      "step": 1280
    },
    {
      "epoch": 1.931137724550898,
      "grad_norm": 0.380859375,
      "learning_rate": 0.00019041997610147167,
      "loss": 0.9942,
      "step": 1290
    },
    {
      "epoch": 1.9461077844311379,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.0001902698337700092,
      "loss": 0.9391,
      "step": 1300
    },
    {
      "epoch": 1.9610778443113772,
      "grad_norm": 0.298828125,
      "learning_rate": 0.00019011858415956038,
      "loss": 0.9993,
      "step": 1310
    },
    {
      "epoch": 1.9760479041916168,
      "grad_norm": 0.29296875,
      "learning_rate": 0.0001899662291254018,
      "loss": 0.9571,
      "step": 1320
    },
    {
      "epoch": 1.9910179640718564,
      "grad_norm": 0.283203125,
      "learning_rate": 0.0001898127705363696,
      "loss": 0.9835,
      "step": 1330
    },
    {
      "epoch": 2.0059880239520957,
      "grad_norm": 0.27734375,
      "learning_rate": 0.00018965821027483654,
      "loss": 0.9305,
      "step": 1340
    },
    {
      "epoch": 2.020958083832335,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00018950255023668876,
      "loss": 0.8295,
      "step": 1350
    },
    {
      "epoch": 2.035928143712575,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00018934579233130267,
      "loss": 0.7653,
      "step": 1360
    },
    {
      "epoch": 2.0508982035928143,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00018918793848152142,
      "loss": 0.7581,
      "step": 1370
    },
    {
      "epoch": 2.065868263473054,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00018902899062363143,
      "loss": 0.7983,
      "step": 1380
    },
    {
      "epoch": 2.0808383233532934,
      "grad_norm": 0.279296875,
      "learning_rate": 0.0001888689507073385,
      "loss": 0.8187,
      "step": 1390
    },
    {
      "epoch": 2.095808383233533,
      "grad_norm": 0.29296875,
      "learning_rate": 0.0001887078206957441,
      "loss": 0.7879,
      "step": 1400
    },
    {
      "epoch": 2.1107784431137726,
      "grad_norm": 0.2734375,
      "learning_rate": 0.000188545602565321,
      "loss": 0.8083,
      "step": 1410
    },
    {
      "epoch": 2.125748502994012,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00018838229830588934,
      "loss": 0.8057,
      "step": 1420
    },
    {
      "epoch": 2.1407185628742513,
      "grad_norm": 0.244140625,
      "learning_rate": 0.00018821790992059196,
      "loss": 0.8194,
      "step": 1430
    },
    {
      "epoch": 2.155688622754491,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.00018805243942587,
      "loss": 0.7958,
      "step": 1440
    },
    {
      "epoch": 2.1706586826347305,
      "grad_norm": 0.248046875,
      "learning_rate": 0.00018788588885143808,
      "loss": 0.8169,
      "step": 1450
    },
    {
      "epoch": 2.18562874251497,
      "grad_norm": 0.265625,
      "learning_rate": 0.00018771826024025946,
      "loss": 0.7722,
      "step": 1460
    },
    {
      "epoch": 2.2005988023952097,
      "grad_norm": 0.265625,
      "learning_rate": 0.0001875495556485208,
      "loss": 0.7934,
      "step": 1470
    },
    {
      "epoch": 2.215568862275449,
      "grad_norm": 0.244140625,
      "learning_rate": 0.00018737977714560738,
      "loss": 0.7915,
      "step": 1480
    },
    {
      "epoch": 2.230538922155689,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00018720892681407708,
      "loss": 0.8021,
      "step": 1490
    },
    {
      "epoch": 2.245508982035928,
      "grad_norm": 0.298828125,
      "learning_rate": 0.00018703700674963547,
      "loss": 0.7987,
      "step": 1500
    },
    {
      "epoch": 2.245508982035928,
      "eval_loss": 0.7798940539360046,
      "eval_runtime": 109.62,
      "eval_samples_per_second": 9.122,
      "eval_steps_per_second": 1.14,
      "step": 1500
    },
    {
      "epoch": 2.2604790419161676,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00018686401906110964,
      "loss": 0.7979,
      "step": 1510
    },
    {
      "epoch": 2.2754491017964074,
      "grad_norm": 0.265625,
      "learning_rate": 0.00018668996587042252,
      "loss": 0.8255,
      "step": 1520
    },
    {
      "epoch": 2.2904191616766467,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.00018651484931256685,
      "loss": 0.8252,
      "step": 1530
    },
    {
      "epoch": 2.305389221556886,
      "grad_norm": 0.25,
      "learning_rate": 0.00018633867153557905,
      "loss": 0.8455,
      "step": 1540
    },
    {
      "epoch": 2.320359281437126,
      "grad_norm": 0.302734375,
      "learning_rate": 0.00018616143470051263,
      "loss": 0.8118,
      "step": 1550
    },
    {
      "epoch": 2.3353293413173652,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00018598314098141206,
      "loss": 0.8122,
      "step": 1560
    },
    {
      "epoch": 2.3502994011976046,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00018580379256528576,
      "loss": 0.7965,
      "step": 1570
    },
    {
      "epoch": 2.3652694610778444,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00018562339165207936,
      "loss": 0.8309,
      "step": 1580
    },
    {
      "epoch": 2.3802395209580838,
      "grad_norm": 0.31640625,
      "learning_rate": 0.00018544194045464886,
      "loss": 0.8046,
      "step": 1590
    },
    {
      "epoch": 2.3952095808383236,
      "grad_norm": 0.2734375,
      "learning_rate": 0.0001852594411987334,
      "loss": 0.8467,
      "step": 1600
    },
    {
      "epoch": 2.410179640718563,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00018507589612292783,
      "loss": 0.8566,
      "step": 1610
    },
    {
      "epoch": 2.4251497005988023,
      "grad_norm": 0.30859375,
      "learning_rate": 0.00018489130747865548,
      "loss": 0.8297,
      "step": 1620
    },
    {
      "epoch": 2.440119760479042,
      "grad_norm": 0.359375,
      "learning_rate": 0.00018470567753014035,
      "loss": 0.7823,
      "step": 1630
    },
    {
      "epoch": 2.4550898203592815,
      "grad_norm": 0.29296875,
      "learning_rate": 0.0001845190085543795,
      "loss": 0.8558,
      "step": 1640
    },
    {
      "epoch": 2.470059880239521,
      "grad_norm": 0.3046875,
      "learning_rate": 0.0001843313028411149,
      "loss": 0.8262,
      "step": 1650
    },
    {
      "epoch": 2.4850299401197606,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00018414256269280564,
      "loss": 0.7982,
      "step": 1660
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.314453125,
      "learning_rate": 0.00018395279042459937,
      "loss": 0.8182,
      "step": 1670
    },
    {
      "epoch": 2.5149700598802394,
      "grad_norm": 0.328125,
      "learning_rate": 0.00018376198836430417,
      "loss": 0.8275,
      "step": 1680
    },
    {
      "epoch": 2.529940119760479,
      "grad_norm": 0.271484375,
      "learning_rate": 0.00018357015885235982,
      "loss": 0.8102,
      "step": 1690
    },
    {
      "epoch": 2.5449101796407185,
      "grad_norm": 0.3125,
      "learning_rate": 0.0001833773042418092,
      "loss": 0.8145,
      "step": 1700
    },
    {
      "epoch": 2.5598802395209583,
      "grad_norm": 1.40625,
      "learning_rate": 0.00018318342689826938,
      "loss": 0.8279,
      "step": 1710
    },
    {
      "epoch": 2.5748502994011977,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00018298852919990252,
      "loss": 0.8354,
      "step": 1720
    },
    {
      "epoch": 2.589820359281437,
      "grad_norm": 0.3125,
      "learning_rate": 0.0001827926135373869,
      "loss": 0.8106,
      "step": 1730
    },
    {
      "epoch": 2.6047904191616764,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00018259568231388738,
      "loss": 0.7983,
      "step": 1740
    },
    {
      "epoch": 2.6197604790419162,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00018239773794502607,
      "loss": 0.8183,
      "step": 1750
    },
    {
      "epoch": 2.6347305389221556,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00018219878285885267,
      "loss": 0.8462,
      "step": 1760
    },
    {
      "epoch": 2.6497005988023954,
      "grad_norm": 0.283203125,
      "learning_rate": 0.0001819988194958146,
      "loss": 0.8227,
      "step": 1770
    },
    {
      "epoch": 2.6646706586826348,
      "grad_norm": 0.2431640625,
      "learning_rate": 0.0001817978503087272,
      "loss": 0.8104,
      "step": 1780
    },
    {
      "epoch": 2.679640718562874,
      "grad_norm": 0.271484375,
      "learning_rate": 0.0001815958777627435,
      "loss": 0.7972,
      "step": 1790
    },
    {
      "epoch": 2.694610778443114,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00018139290433532416,
      "loss": 0.8339,
      "step": 1800
    },
    {
      "epoch": 2.7095808383233533,
      "grad_norm": 0.298828125,
      "learning_rate": 0.00018118893251620682,
      "loss": 0.8723,
      "step": 1810
    },
    {
      "epoch": 2.724550898203593,
      "grad_norm": 0.3203125,
      "learning_rate": 0.00018098396480737585,
      "loss": 0.8544,
      "step": 1820
    },
    {
      "epoch": 2.7395209580838324,
      "grad_norm": 0.265625,
      "learning_rate": 0.0001807780037230315,
      "loss": 0.8557,
      "step": 1830
    },
    {
      "epoch": 2.754491017964072,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00018057105178955905,
      "loss": 0.8283,
      "step": 1840
    },
    {
      "epoch": 2.769461077844311,
      "grad_norm": 0.3828125,
      "learning_rate": 0.00018036311154549784,
      "loss": 0.7906,
      "step": 1850
    },
    {
      "epoch": 2.784431137724551,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.0001801541855415102,
      "loss": 0.8036,
      "step": 1860
    },
    {
      "epoch": 2.7994011976047903,
      "grad_norm": 0.298828125,
      "learning_rate": 0.00017994427634035015,
      "loss": 0.7828,
      "step": 1870
    },
    {
      "epoch": 2.81437125748503,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00017973338651683176,
      "loss": 0.7915,
      "step": 1880
    },
    {
      "epoch": 2.8293413173652695,
      "grad_norm": 0.228515625,
      "learning_rate": 0.00017952151865779792,
      "loss": 0.8141,
      "step": 1890
    },
    {
      "epoch": 2.844311377245509,
      "grad_norm": 0.349609375,
      "learning_rate": 0.00017930867536208826,
      "loss": 0.8155,
      "step": 1900
    },
    {
      "epoch": 2.8592814371257482,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00017909485924050758,
      "loss": 0.8004,
      "step": 1910
    },
    {
      "epoch": 2.874251497005988,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.00017888007291579357,
      "loss": 0.803,
      "step": 1920
    },
    {
      "epoch": 2.8892215568862274,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00017866431902258478,
      "loss": 0.804,
      "step": 1930
    },
    {
      "epoch": 2.904191616766467,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00017844760020738827,
      "loss": 0.8154,
      "step": 1940
    },
    {
      "epoch": 2.9191616766467066,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00017822991912854713,
      "loss": 0.8257,
      "step": 1950
    },
    {
      "epoch": 2.934131736526946,
      "grad_norm": 0.3125,
      "learning_rate": 0.00017801127845620793,
      "loss": 0.8386,
      "step": 1960
    },
    {
      "epoch": 2.9491017964071857,
      "grad_norm": 0.275390625,
      "learning_rate": 0.0001777916808722879,
      "loss": 0.8003,
      "step": 1970
    },
    {
      "epoch": 2.964071856287425,
      "grad_norm": 0.28515625,
      "learning_rate": 0.000177571129070442,
      "loss": 0.8055,
      "step": 1980
    },
    {
      "epoch": 2.979041916167665,
      "grad_norm": 0.271484375,
      "learning_rate": 0.00017734962575603,
      "loss": 0.8233,
      "step": 1990
    },
    {
      "epoch": 2.9940119760479043,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00017712717364608328,
      "loss": 0.8106,
      "step": 2000
    },
    {
      "epoch": 2.9940119760479043,
      "eval_loss": 0.6911507844924927,
      "eval_runtime": 109.6711,
      "eval_samples_per_second": 9.118,
      "eval_steps_per_second": 1.14,
      "step": 2000
    },
    {
      "epoch": 3.0089820359281436,
      "grad_norm": 0.37890625,
      "learning_rate": 0.00017690377546927133,
      "loss": 0.7276,
      "step": 2010
    },
    {
      "epoch": 3.0239520958083834,
      "grad_norm": 0.27734375,
      "learning_rate": 0.00017667943396586848,
      "loss": 0.6882,
      "step": 2020
    },
    {
      "epoch": 3.038922155688623,
      "grad_norm": 0.291015625,
      "learning_rate": 0.0001764541518877202,
      "loss": 0.6786,
      "step": 2030
    },
    {
      "epoch": 3.053892215568862,
      "grad_norm": 0.31640625,
      "learning_rate": 0.00017622793199820934,
      "loss": 0.6352,
      "step": 2040
    },
    {
      "epoch": 3.068862275449102,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00017600077707222224,
      "loss": 0.6648,
      "step": 2050
    },
    {
      "epoch": 3.0838323353293413,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00017577268989611472,
      "loss": 0.6601,
      "step": 2060
    },
    {
      "epoch": 3.0988023952095807,
      "grad_norm": 0.361328125,
      "learning_rate": 0.00017554367326767792,
      "loss": 0.6645,
      "step": 2070
    },
    {
      "epoch": 3.1137724550898205,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00017531372999610384,
      "loss": 0.6631,
      "step": 2080
    },
    {
      "epoch": 3.12874251497006,
      "grad_norm": 0.30078125,
      "learning_rate": 0.0001750828629019511,
      "loss": 0.6451,
      "step": 2090
    },
    {
      "epoch": 3.143712574850299,
      "grad_norm": 0.365234375,
      "learning_rate": 0.00017485107481711012,
      "loss": 0.6703,
      "step": 2100
    },
    {
      "epoch": 3.158682634730539,
      "grad_norm": 0.341796875,
      "learning_rate": 0.00017461836858476856,
      "loss": 0.6598,
      "step": 2110
    },
    {
      "epoch": 3.1736526946107784,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00017438474705937639,
      "loss": 0.6689,
      "step": 2120
    },
    {
      "epoch": 3.1886227544910177,
      "grad_norm": 0.3125,
      "learning_rate": 0.0001741502131066107,
      "loss": 0.6741,
      "step": 2130
    },
    {
      "epoch": 3.2035928143712575,
      "grad_norm": 0.31640625,
      "learning_rate": 0.000173914769603341,
      "loss": 0.6691,
      "step": 2140
    },
    {
      "epoch": 3.218562874251497,
      "grad_norm": 0.322265625,
      "learning_rate": 0.00017367841943759338,
      "loss": 0.6702,
      "step": 2150
    },
    {
      "epoch": 3.2335329341317367,
      "grad_norm": 0.265625,
      "learning_rate": 0.00017344116550851543,
      "loss": 0.6451,
      "step": 2160
    },
    {
      "epoch": 3.248502994011976,
      "grad_norm": 0.328125,
      "learning_rate": 0.00017320301072634066,
      "loss": 0.6507,
      "step": 2170
    },
    {
      "epoch": 3.2634730538922154,
      "grad_norm": 0.32421875,
      "learning_rate": 0.00017296395801235265,
      "loss": 0.695,
      "step": 2180
    },
    {
      "epoch": 3.2784431137724552,
      "grad_norm": 0.31640625,
      "learning_rate": 0.00017272401029884933,
      "loss": 0.6798,
      "step": 2190
    },
    {
      "epoch": 3.2934131736526946,
      "grad_norm": 0.330078125,
      "learning_rate": 0.000172483170529107,
      "loss": 0.6526,
      "step": 2200
    },
    {
      "epoch": 3.308383233532934,
      "grad_norm": 0.35546875,
      "learning_rate": 0.00017224144165734417,
      "loss": 0.6538,
      "step": 2210
    },
    {
      "epoch": 3.3233532934131738,
      "grad_norm": 0.291015625,
      "learning_rate": 0.00017199882664868538,
      "loss": 0.6777,
      "step": 2220
    },
    {
      "epoch": 3.338323353293413,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00017175532847912487,
      "loss": 0.6762,
      "step": 2230
    },
    {
      "epoch": 3.3532934131736525,
      "grad_norm": 0.3203125,
      "learning_rate": 0.00017151095013548994,
      "loss": 0.673,
      "step": 2240
    },
    {
      "epoch": 3.3682634730538923,
      "grad_norm": 0.318359375,
      "learning_rate": 0.00017126569461540443,
      "loss": 0.6757,
      "step": 2250
    },
    {
      "epoch": 3.3832335329341316,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00017101956492725185,
      "loss": 0.6563,
      "step": 2260
    },
    {
      "epoch": 3.3982035928143715,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00017077256409013866,
      "loss": 0.6877,
      "step": 2270
    },
    {
      "epoch": 3.413173652694611,
      "grad_norm": 0.328125,
      "learning_rate": 0.000170524695133857,
      "loss": 0.67,
      "step": 2280
    },
    {
      "epoch": 3.42814371257485,
      "grad_norm": 0.326171875,
      "learning_rate": 0.00017027596109884768,
      "loss": 0.6808,
      "step": 2290
    },
    {
      "epoch": 3.44311377245509,
      "grad_norm": 0.318359375,
      "learning_rate": 0.00017002636503616282,
      "loss": 0.6941,
      "step": 2300
    },
    {
      "epoch": 3.4580838323353293,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00016977591000742854,
      "loss": 0.6798,
      "step": 2310
    },
    {
      "epoch": 3.4730538922155687,
      "grad_norm": 0.287109375,
      "learning_rate": 0.0001695245990848072,
      "loss": 0.6718,
      "step": 2320
    },
    {
      "epoch": 3.4880239520958085,
      "grad_norm": 0.30859375,
      "learning_rate": 0.00016927243535095997,
      "loss": 0.6483,
      "step": 2330
    },
    {
      "epoch": 3.502994011976048,
      "grad_norm": 0.333984375,
      "learning_rate": 0.00016901942189900867,
      "loss": 0.7177,
      "step": 2340
    },
    {
      "epoch": 3.5179640718562872,
      "grad_norm": 0.34765625,
      "learning_rate": 0.00016876556183249822,
      "loss": 0.6833,
      "step": 2350
    },
    {
      "epoch": 3.532934131736527,
      "grad_norm": 0.365234375,
      "learning_rate": 0.00016851085826535838,
      "loss": 0.6826,
      "step": 2360
    },
    {
      "epoch": 3.5479041916167664,
      "grad_norm": 0.396484375,
      "learning_rate": 0.00016825531432186543,
      "loss": 0.6835,
      "step": 2370
    },
    {
      "epoch": 3.562874251497006,
      "grad_norm": 0.359375,
      "learning_rate": 0.00016799893313660408,
      "loss": 0.6791,
      "step": 2380
    },
    {
      "epoch": 3.5778443113772456,
      "grad_norm": 0.291015625,
      "learning_rate": 0.0001677417178544289,
      "loss": 0.6787,
      "step": 2390
    },
    {
      "epoch": 3.592814371257485,
      "grad_norm": 0.296875,
      "learning_rate": 0.00016748367163042576,
      "loss": 0.6542,
      "step": 2400
    },
    {
      "epoch": 3.6077844311377243,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00016722479762987317,
      "loss": 0.6805,
      "step": 2410
    },
    {
      "epoch": 3.622754491017964,
      "grad_norm": 0.314453125,
      "learning_rate": 0.0001669650990282033,
      "loss": 0.6859,
      "step": 2420
    },
    {
      "epoch": 3.6377245508982035,
      "grad_norm": 0.32421875,
      "learning_rate": 0.00016670457901096328,
      "loss": 0.6633,
      "step": 2430
    },
    {
      "epoch": 3.6526946107784433,
      "grad_norm": 0.32421875,
      "learning_rate": 0.00016644324077377592,
      "loss": 0.6958,
      "step": 2440
    },
    {
      "epoch": 3.6676646706586826,
      "grad_norm": 0.3125,
      "learning_rate": 0.00016618108752230052,
      "loss": 0.6965,
      "step": 2450
    },
    {
      "epoch": 3.682634730538922,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00016591812247219377,
      "loss": 0.6851,
      "step": 2460
    },
    {
      "epoch": 3.697604790419162,
      "grad_norm": 0.400390625,
      "learning_rate": 0.00016565434884907002,
      "loss": 0.6669,
      "step": 2470
    },
    {
      "epoch": 3.712574850299401,
      "grad_norm": 0.306640625,
      "learning_rate": 0.0001653897698884619,
      "loss": 0.6672,
      "step": 2480
    },
    {
      "epoch": 3.727544910179641,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00016512438883578044,
      "loss": 0.7049,
      "step": 2490
    },
    {
      "epoch": 3.7425149700598803,
      "grad_norm": 0.30859375,
      "learning_rate": 0.0001648582089462756,
      "loss": 0.6821,
      "step": 2500
    },
    {
      "epoch": 3.7425149700598803,
      "eval_loss": 0.5838193893432617,
      "eval_runtime": 109.8377,
      "eval_samples_per_second": 9.104,
      "eval_steps_per_second": 1.138,
      "step": 2500
    }
  ],
  "logging_steps": 10,
  "max_steps": 9000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 14,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.7382403865051136e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}