| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9996222709073053, |
| "eval_steps": 500, |
| "global_step": 3308, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0030218327415577548, |
| "grad_norm": 166.1145477294922, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 13.8334, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0060436654831155096, |
| "grad_norm": 81.33020782470703, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 13.1022, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.009065498224673264, |
| "grad_norm": 51.09122085571289, |
| "learning_rate": 3e-06, |
| "loss": 12.0899, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.012087330966231019, |
| "grad_norm": 49.01457214355469, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 10.2942, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.015109163707788774, |
| "grad_norm": 46.00205612182617, |
| "learning_rate": 5e-06, |
| "loss": 9.538, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.01813099644934653, |
| "grad_norm": 41.61635971069336, |
| "learning_rate": 6e-06, |
| "loss": 8.3357, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.021152829190904283, |
| "grad_norm": 31.584325790405273, |
| "learning_rate": 7e-06, |
| "loss": 6.9115, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.024174661932462038, |
| "grad_norm": 51.664695739746094, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 6.7596, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.027196494674019793, |
| "grad_norm": 31.014076232910156, |
| "learning_rate": 9e-06, |
| "loss": 7.0298, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.030218327415577548, |
| "grad_norm": 37.792179107666016, |
| "learning_rate": 1e-05, |
| "loss": 6.7021, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0332401601571353, |
| "grad_norm": 37.68498611450195, |
| "learning_rate": 9.999760394462267e-06, |
| "loss": 7.3545, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.03626199289869306, |
| "grad_norm": 38.4805793762207, |
| "learning_rate": 9.999041600813393e-06, |
| "loss": 7.0073, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.03928382564025081, |
| "grad_norm": 32.300174713134766, |
| "learning_rate": 9.997843687944153e-06, |
| "loss": 6.2416, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.04230565838180857, |
| "grad_norm": 29.263317108154297, |
| "learning_rate": 9.996166770665168e-06, |
| "loss": 5.5583, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.04532749112336632, |
| "grad_norm": 33.3656005859375, |
| "learning_rate": 9.994011009695908e-06, |
| "loss": 5.6737, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.048349323864924076, |
| "grad_norm": 32.699825286865234, |
| "learning_rate": 9.991376611649278e-06, |
| "loss": 6.0879, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.05137115660648183, |
| "grad_norm": 27.45968246459961, |
| "learning_rate": 9.988263829011821e-06, |
| "loss": 5.4056, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.054392989348039586, |
| "grad_norm": 25.30878448486328, |
| "learning_rate": 9.984672960119523e-06, |
| "loss": 5.3618, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.05741482208959734, |
| "grad_norm": 40.055721282958984, |
| "learning_rate": 9.980604349129212e-06, |
| "loss": 5.7602, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.060436654831155096, |
| "grad_norm": 26.245195388793945, |
| "learning_rate": 9.976058385985575e-06, |
| "loss": 5.186, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.06345848757271286, |
| "grad_norm": 34.81965637207031, |
| "learning_rate": 9.971035506383791e-06, |
| "loss": 5.5341, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.0664803203142706, |
| "grad_norm": 29.513893127441406, |
| "learning_rate": 9.96553619172777e-06, |
| "loss": 5.0542, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.06950215305582837, |
| "grad_norm": 32.30284118652344, |
| "learning_rate": 9.959560969084004e-06, |
| "loss": 5.3365, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.07252398579738611, |
| "grad_norm": 27.652576446533203, |
| "learning_rate": 9.953110411131073e-06, |
| "loss": 4.7513, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.07554581853894388, |
| "grad_norm": 28.387413024902344, |
| "learning_rate": 9.946185136104736e-06, |
| "loss": 5.4127, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.07856765128050162, |
| "grad_norm": 29.694316864013672, |
| "learning_rate": 9.938785807738692e-06, |
| "loss": 4.813, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.08158948402205939, |
| "grad_norm": 31.964120864868164, |
| "learning_rate": 9.930913135200964e-06, |
| "loss": 5.4212, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.08461131676361713, |
| "grad_norm": 23.594715118408203, |
| "learning_rate": 9.922567873025924e-06, |
| "loss": 5.2445, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.0876331495051749, |
| "grad_norm": 23.0896053314209, |
| "learning_rate": 9.913750821041988e-06, |
| "loss": 4.5194, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.09065498224673264, |
| "grad_norm": 25.44329833984375, |
| "learning_rate": 9.904462824294945e-06, |
| "loss": 4.6093, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.0936768149882904, |
| "grad_norm": 27.408288955688477, |
| "learning_rate": 9.894704772966978e-06, |
| "loss": 4.512, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.09669864772984815, |
| "grad_norm": 24.26542091369629, |
| "learning_rate": 9.884477602291343e-06, |
| "loss": 4.5071, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.09972048047140591, |
| "grad_norm": 35.8819694519043, |
| "learning_rate": 9.873782292462727e-06, |
| "loss": 4.3557, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.10274231321296366, |
| "grad_norm": 29.487594604492188, |
| "learning_rate": 9.862619868543323e-06, |
| "loss": 8.2236, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.10576414595452142, |
| "grad_norm": 38.13749694824219, |
| "learning_rate": 9.850991400364557e-06, |
| "loss": 5.1538, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.10878597869607917, |
| "grad_norm": 25.492799758911133, |
| "learning_rate": 9.838898002424586e-06, |
| "loss": 6.0666, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.11180781143763693, |
| "grad_norm": 31.119089126586914, |
| "learning_rate": 9.826340833781448e-06, |
| "loss": 5.8633, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.11482964417919468, |
| "grad_norm": 21.065149307250977, |
| "learning_rate": 9.813321097942005e-06, |
| "loss": 5.1017, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.11785147692075244, |
| "grad_norm": 29.40814971923828, |
| "learning_rate": 9.79984004274658e-06, |
| "loss": 4.9132, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.12087330966231019, |
| "grad_norm": 22.45477294921875, |
| "learning_rate": 9.785898960249365e-06, |
| "loss": 4.2496, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.12389514240386795, |
| "grad_norm": 19.05487060546875, |
| "learning_rate": 9.771499186594586e-06, |
| "loss": 5.0767, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.12691697514542571, |
| "grad_norm": 31.310686111450195, |
| "learning_rate": 9.756642101888449e-06, |
| "loss": 5.192, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.12993880788698345, |
| "grad_norm": 25.689640045166016, |
| "learning_rate": 9.74132913006686e-06, |
| "loss": 3.445, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.1329606406285412, |
| "grad_norm": 21.052574157714844, |
| "learning_rate": 9.725561738758956e-06, |
| "loss": 3.3354, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.13598247337009897, |
| "grad_norm": 24.987884521484375, |
| "learning_rate": 9.709341439146452e-06, |
| "loss": 5.0777, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.13900430611165673, |
| "grad_norm": 26.842397689819336, |
| "learning_rate": 9.692669785818787e-06, |
| "loss": 6.4292, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.14202613885321447, |
| "grad_norm": 35.66836166381836, |
| "learning_rate": 9.675548376624149e-06, |
| "loss": 5.7348, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.14504797159477223, |
| "grad_norm": 29.318471908569336, |
| "learning_rate": 9.657978852516318e-06, |
| "loss": 5.6924, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.14806980433633, |
| "grad_norm": 23.544092178344727, |
| "learning_rate": 9.639962897397405e-06, |
| "loss": 4.183, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.15109163707788775, |
| "grad_norm": 22.90180206298828, |
| "learning_rate": 9.621502237956452e-06, |
| "loss": 5.085, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.1541134698194455, |
| "grad_norm": 23.748275756835938, |
| "learning_rate": 9.602598643503957e-06, |
| "loss": 3.2694, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.15713530256100325, |
| "grad_norm": 29.096708297729492, |
| "learning_rate": 9.583253925802283e-06, |
| "loss": 4.2373, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.160157135302561, |
| "grad_norm": 24.87314796447754, |
| "learning_rate": 9.563469938892023e-06, |
| "loss": 4.8482, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.16317896804411877, |
| "grad_norm": 24.310091018676758, |
| "learning_rate": 9.543248578914309e-06, |
| "loss": 3.2299, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.1662008007856765, |
| "grad_norm": 24.80878257751465, |
| "learning_rate": 9.522591783929069e-06, |
| "loss": 4.8424, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.16922263352723427, |
| "grad_norm": 22.142215728759766, |
| "learning_rate": 9.501501533729297e-06, |
| "loss": 4.1786, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.17224446626879203, |
| "grad_norm": 33.77587890625, |
| "learning_rate": 9.479979849651287e-06, |
| "loss": 5.7505, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.1752662990103498, |
| "grad_norm": 25.414831161499023, |
| "learning_rate": 9.45802879438091e-06, |
| "loss": 6.3392, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.17828813175190752, |
| "grad_norm": 25.716073989868164, |
| "learning_rate": 9.43565047175593e-06, |
| "loss": 4.1603, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.1813099644934653, |
| "grad_norm": 25.389522552490234, |
| "learning_rate": 9.412847026564359e-06, |
| "loss": 3.9676, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.18433179723502305, |
| "grad_norm": 22.911640167236328, |
| "learning_rate": 9.389620644338893e-06, |
| "loss": 4.1508, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.1873536299765808, |
| "grad_norm": 36.27210998535156, |
| "learning_rate": 9.365973551147453e-06, |
| "loss": 4.691, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.19037546271813854, |
| "grad_norm": 23.555246353149414, |
| "learning_rate": 9.341908013379832e-06, |
| "loss": 4.7148, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.1933972954596963, |
| "grad_norm": 25.42097282409668, |
| "learning_rate": 9.317426337530477e-06, |
| "loss": 4.0105, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.19641912820125407, |
| "grad_norm": 24.92901611328125, |
| "learning_rate": 9.292530869977432e-06, |
| "loss": 5.5589, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.19944096094281183, |
| "grad_norm": 26.411352157592773, |
| "learning_rate": 9.26722399675745e-06, |
| "loss": 3.1881, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.20246279368436956, |
| "grad_norm": 22.39121437072754, |
| "learning_rate": 9.24150814333732e-06, |
| "loss": 3.9177, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.20548462642592732, |
| "grad_norm": 21.436046600341797, |
| "learning_rate": 9.215385774381395e-06, |
| "loss": 6.2124, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.2085064591674851, |
| "grad_norm": 42.19996643066406, |
| "learning_rate": 9.188859393515382e-06, |
| "loss": 4.863, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.21152829190904285, |
| "grad_norm": 24.43948745727539, |
| "learning_rate": 9.16193154308638e-06, |
| "loss": 6.0562, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.21455012465060058, |
| "grad_norm": 36.5896110534668, |
| "learning_rate": 9.13460480391922e-06, |
| "loss": 6.1878, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.21757195739215834, |
| "grad_norm": 39.19657897949219, |
| "learning_rate": 9.106881795069116e-06, |
| "loss": 6.4964, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.2205937901337161, |
| "grad_norm": 19.438859939575195, |
| "learning_rate": 9.078765173570649e-06, |
| "loss": 3.1914, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.22361562287527387, |
| "grad_norm": 26.316898345947266, |
| "learning_rate": 9.0502576341831e-06, |
| "loss": 4.0543, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.2266374556168316, |
| "grad_norm": 21.5406436920166, |
| "learning_rate": 9.02136190913219e-06, |
| "loss": 5.4649, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.22965928835838936, |
| "grad_norm": 38.014617919921875, |
| "learning_rate": 8.99208076784822e-06, |
| "loss": 4.6499, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.23268112109994712, |
| "grad_norm": 16.046876907348633, |
| "learning_rate": 8.962417016700624e-06, |
| "loss": 3.0368, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.2357029538415049, |
| "grad_norm": 25.170169830322266, |
| "learning_rate": 8.932373498729026e-06, |
| "loss": 4.6374, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.23872478658306262, |
| "grad_norm": 28.294591903686523, |
| "learning_rate": 8.901953093370734e-06, |
| "loss": 4.0344, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.24174661932462038, |
| "grad_norm": 25.618423461914062, |
| "learning_rate": 8.871158716184784e-06, |
| "loss": 3.9153, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.24476845206617814, |
| "grad_norm": 33.044132232666016, |
| "learning_rate": 8.839993318572497e-06, |
| "loss": 4.852, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.2477902848077359, |
| "grad_norm": 19.522127151489258, |
| "learning_rate": 8.808459887494617e-06, |
| "loss": 3.0679, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.25081211754929367, |
| "grad_norm": 17.915157318115234, |
| "learning_rate": 8.77656144518502e-06, |
| "loss": 3.832, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.25383395029085143, |
| "grad_norm": 18.468053817749023, |
| "learning_rate": 8.744301048861083e-06, |
| "loss": 2.9134, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.25685578303240914, |
| "grad_norm": 25.19109535217285, |
| "learning_rate": 8.711681790430646e-06, |
| "loss": 2.9987, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.2598776157739669, |
| "grad_norm": 27.227184295654297, |
| "learning_rate": 8.678706796195694e-06, |
| "loss": 4.7592, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.26289944851552466, |
| "grad_norm": 28.04375457763672, |
| "learning_rate": 8.645379226552712e-06, |
| "loss": 3.7402, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.2659212812570824, |
| "grad_norm": 21.457616806030273, |
| "learning_rate": 8.611702275689805e-06, |
| "loss": 4.6756, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.2689431139986402, |
| "grad_norm": 35.01508331298828, |
| "learning_rate": 8.577679171280538e-06, |
| "loss": 4.5315, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.27196494674019794, |
| "grad_norm": 20.160045623779297, |
| "learning_rate": 8.543313174174601e-06, |
| "loss": 5.2698, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.2749867794817557, |
| "grad_norm": 22.52850341796875, |
| "learning_rate": 8.508607578085281e-06, |
| "loss": 3.849, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.27800861222331347, |
| "grad_norm": 21.895462036132812, |
| "learning_rate": 8.473565709273786e-06, |
| "loss": 3.8616, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.2810304449648712, |
| "grad_norm": 16.077316284179688, |
| "learning_rate": 8.438190926230439e-06, |
| "loss": 3.8386, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.28405227770642894, |
| "grad_norm": 33.1984977722168, |
| "learning_rate": 8.40248661935281e-06, |
| "loss": 4.3994, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.2870741104479867, |
| "grad_norm": 27.1571102142334, |
| "learning_rate": 8.366456210620756e-06, |
| "loss": 3.1029, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.29009594318954446, |
| "grad_norm": 31.706750869750977, |
| "learning_rate": 8.330103153268464e-06, |
| "loss": 3.7567, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.2931177759311022, |
| "grad_norm": 24.30504608154297, |
| "learning_rate": 8.29343093145347e-06, |
| "loss": 3.6988, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.29613960867266, |
| "grad_norm": 24.231523513793945, |
| "learning_rate": 8.25644305992275e-06, |
| "loss": 3.6097, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.29916144141421774, |
| "grad_norm": 19.621383666992188, |
| "learning_rate": 8.21914308367584e-06, |
| "loss": 4.5566, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.3021832741557755, |
| "grad_norm": 21.627859115600586, |
| "learning_rate": 8.181534577625088e-06, |
| "loss": 3.7714, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.3052051068973332, |
| "grad_norm": 14.206421852111816, |
| "learning_rate": 8.143621146253022e-06, |
| "loss": 4.6373, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.308226939638891, |
| "grad_norm": 27.084983825683594, |
| "learning_rate": 8.105406423266884e-06, |
| "loss": 4.6538, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.31124877238044873, |
| "grad_norm": 20.950910568237305, |
| "learning_rate": 8.066894071250374e-06, |
| "loss": 4.4614, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.3142706051220065, |
| "grad_norm": 20.357742309570312, |
| "learning_rate": 8.02808778131262e-06, |
| "loss": 3.7694, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.31729243786356426, |
| "grad_norm": 18.685476303100586, |
| "learning_rate": 7.988991272734407e-06, |
| "loss": 4.4575, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.320314270605122, |
| "grad_norm": 24.249338150024414, |
| "learning_rate": 7.94960829261172e-06, |
| "loss": 4.4394, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.3233361033466798, |
| "grad_norm": 22.846027374267578, |
| "learning_rate": 7.909942615496613e-06, |
| "loss": 4.7241, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.32635793608823754, |
| "grad_norm": 30.40308952331543, |
| "learning_rate": 7.869998043035442e-06, |
| "loss": 5.3999, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.32937976882979525, |
| "grad_norm": 17.647789001464844, |
| "learning_rate": 7.829778403604512e-06, |
| "loss": 5.0469, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.332401601571353, |
| "grad_norm": 33.98617935180664, |
| "learning_rate": 7.789287551943158e-06, |
| "loss": 6.0896, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.3354234343129108, |
| "grad_norm": 21.646024703979492, |
| "learning_rate": 7.748529368784293e-06, |
| "loss": 4.5196, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.33844526705446853, |
| "grad_norm": 18.94881820678711, |
| "learning_rate": 7.707507760482473e-06, |
| "loss": 6.1607, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.3414670997960263, |
| "grad_norm": 18.058412551879883, |
| "learning_rate": 7.666226658639507e-06, |
| "loss": 3.7909, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.34448893253758406, |
| "grad_norm": 22.541349411010742, |
| "learning_rate": 7.624690019727636e-06, |
| "loss": 3.638, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.3475107652791418, |
| "grad_norm": 23.882991790771484, |
| "learning_rate": 7.58290182471034e-06, |
| "loss": 4.53, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.3505325980206996, |
| "grad_norm": 19.6879940032959, |
| "learning_rate": 7.5408660786607976e-06, |
| "loss": 3.6987, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.3535544307622573, |
| "grad_norm": 20.6401309967041, |
| "learning_rate": 7.498586810378019e-06, |
| "loss": 2.9513, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.35657626350381505, |
| "grad_norm": 22.658132553100586, |
| "learning_rate": 7.456068072000731e-06, |
| "loss": 2.8103, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.3595980962453728, |
| "grad_norm": 23.935726165771484, |
| "learning_rate": 7.4133139386190026e-06, |
| "loss": 4.5498, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.3626199289869306, |
| "grad_norm": 18.697385787963867, |
| "learning_rate": 7.3703285078836796e-06, |
| "loss": 5.2042, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.36564176172848833, |
| "grad_norm": 17.5216064453125, |
| "learning_rate": 7.3271158996136625e-06, |
| "loss": 3.7229, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.3686635944700461, |
| "grad_norm": 18.313034057617188, |
| "learning_rate": 7.283680255401049e-06, |
| "loss": 4.403, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.37168542721160386, |
| "grad_norm": 19.784748077392578, |
| "learning_rate": 7.240025738214193e-06, |
| "loss": 6.1978, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.3747072599531616, |
| "grad_norm": 33.28024673461914, |
| "learning_rate": 7.196156531998718e-06, |
| "loss": 4.4892, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.3777290926947193, |
| "grad_norm": 20.449913024902344, |
| "learning_rate": 7.152076841276527e-06, |
| "loss": 3.6566, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.3807509254362771, |
| "grad_norm": 19.441957473754883, |
| "learning_rate": 7.1077908907428154e-06, |
| "loss": 3.7812, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.38377275817783485, |
| "grad_norm": 32.515724182128906, |
| "learning_rate": 7.063302924861182e-06, |
| "loss": 3.8969, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.3867945909193926, |
| "grad_norm": 22.129140853881836, |
| "learning_rate": 7.018617207456821e-06, |
| "loss": 3.5997, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.3898164236609504, |
| "grad_norm": 19.576011657714844, |
| "learning_rate": 6.973738021307872e-06, |
| "loss": 3.6646, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.39283825640250813, |
| "grad_norm": 17.848796844482422, |
| "learning_rate": 6.9286696677349455e-06, |
| "loss": 5.9623, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.3958600891440659, |
| "grad_norm": 15.815289497375488, |
| "learning_rate": 6.883416466188881e-06, |
| "loss": 3.6821, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.39888192188562366, |
| "grad_norm": 17.62392807006836, |
| "learning_rate": 6.837982753836755e-06, |
| "loss": 2.8778, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.40190375462718136, |
| "grad_norm": 34.39213180541992, |
| "learning_rate": 6.7923728851461955e-06, |
| "loss": 6.0046, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.4049255873687391, |
| "grad_norm": 22.834793090820312, |
| "learning_rate": 6.74659123146805e-06, |
| "loss": 3.6498, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.4079474201102969, |
| "grad_norm": 18.146869659423828, |
| "learning_rate": 6.70064218061742e-06, |
| "loss": 2.8181, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.41096925285185465, |
| "grad_norm": 18.262357711791992, |
| "learning_rate": 6.654530136453119e-06, |
| "loss": 4.3635, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.4139910855934124, |
| "grad_norm": 18.1636905670166, |
| "learning_rate": 6.608259518455599e-06, |
| "loss": 5.2127, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.4170129183349702, |
| "grad_norm": 17.246234893798828, |
| "learning_rate": 6.5618347613033875e-06, |
| "loss": 5.1173, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.42003475107652793, |
| "grad_norm": 19.54306983947754, |
| "learning_rate": 6.5152603144480406e-06, |
| "loss": 5.9817, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.4230565838180857, |
| "grad_norm": 31.445457458496094, |
| "learning_rate": 6.468540641687716e-06, |
| "loss": 4.5568, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.4260784165596434, |
| "grad_norm": 19.258493423461914, |
| "learning_rate": 6.421680220739337e-06, |
| "loss": 3.9311, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.42910024930120116, |
| "grad_norm": 33.21185302734375, |
| "learning_rate": 6.374683542809447e-06, |
| "loss": 7.8417, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.4321220820427589, |
| "grad_norm": 19.956239700317383, |
| "learning_rate": 6.327555112163761e-06, |
| "loss": 4.3582, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.4351439147843167, |
| "grad_norm": 19.256486892700195, |
| "learning_rate": 6.280299445695469e-06, |
| "loss": 5.2, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.43816574752587445, |
| "grad_norm": 20.045286178588867, |
| "learning_rate": 6.232921072492319e-06, |
| "loss": 4.3409, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.4411875802674322, |
| "grad_norm": 24.16641616821289, |
| "learning_rate": 6.185424533402543e-06, |
| "loss": 4.3162, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.44420941300899, |
| "grad_norm": 23.316164016723633, |
| "learning_rate": 6.13781438059966e-06, |
| "loss": 3.5112, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.44723124575054773, |
| "grad_norm": 34.204627990722656, |
| "learning_rate": 6.090095177146178e-06, |
| "loss": 5.1696, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.45025307849210544, |
| "grad_norm": 17.53434181213379, |
| "learning_rate": 6.042271496556255e-06, |
| "loss": 2.7874, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.4532749112336632, |
| "grad_norm": 21.362934112548828, |
| "learning_rate": 5.994347922357372e-06, |
| "loss": 3.8133, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.45629674397522096, |
| "grad_norm": 19.935638427734375, |
| "learning_rate": 5.946329047651037e-06, |
| "loss": 3.592, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.4593185767167787, |
| "grad_norm": 17.95412826538086, |
| "learning_rate": 5.8982194746725686e-06, |
| "loss": 2.7345, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.4623404094583365, |
| "grad_norm": 24.026193618774414, |
| "learning_rate": 5.850023814350007e-06, |
| "loss": 4.2519, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.46536224219989425, |
| "grad_norm": 12.00658893585205, |
| "learning_rate": 5.801746685862197e-06, |
| "loss": 6.0717, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.468384074941452, |
| "grad_norm": 14.519695281982422, |
| "learning_rate": 5.753392716196069e-06, |
| "loss": 2.8474, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.4714059076830098, |
| "grad_norm": 15.277630805969238, |
| "learning_rate": 5.704966539703185e-06, |
| "loss": 3.6301, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.4744277404245675, |
| "grad_norm": 17.934938430786133, |
| "learning_rate": 5.656472797655571e-06, |
| "loss": 4.4189, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.47744957316612524, |
| "grad_norm": 17.185529708862305, |
| "learning_rate": 5.60791613780088e-06, |
| "loss": 2.7758, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.480471405907683, |
| "grad_norm": 25.111557006835938, |
| "learning_rate": 5.5593012139169525e-06, |
| "loss": 4.296, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.48349323864924076, |
| "grad_norm": 23.77570343017578, |
| "learning_rate": 5.510632685365777e-06, |
| "loss": 4.4462, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.4865150713907985, |
| "grad_norm": 17.37128448486328, |
| "learning_rate": 5.461915216646938e-06, |
| "loss": 2.7426, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.4895369041323563, |
| "grad_norm": 23.484580993652344, |
| "learning_rate": 5.41315347695055e-06, |
| "loss": 4.2378, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.49255873687391405, |
| "grad_norm": 23.495826721191406, |
| "learning_rate": 5.364352139709758e-06, |
| "loss": 4.8879, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.4955805696154718, |
| "grad_norm": 16.23356819152832, |
| "learning_rate": 5.315515882152822e-06, |
| "loss": 3.5359, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.4986024023570295, |
| "grad_norm": 16.77799415588379, |
| "learning_rate": 5.266649384854842e-06, |
| "loss": 4.2516, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.5016242350985873, |
| "grad_norm": 21.264799118041992, |
| "learning_rate": 5.217757331289165e-06, |
| "loss": 3.6844, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.5046460678401451, |
| "grad_norm": 18.198184967041016, |
| "learning_rate": 5.168844407378506e-06, |
| "loss": 4.8485, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.5076679005817029, |
| "grad_norm": 13.497072219848633, |
| "learning_rate": 5.119915301045836e-06, |
| "loss": 2.8835, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.5106897333232605, |
| "grad_norm": 24.342716217041016, |
| "learning_rate": 5.070974701765089e-06, |
| "loss": 5.1527, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.5137115660648183, |
| "grad_norm": 25.917234420776367, |
| "learning_rate": 5.022027300111712e-06, |
| "loss": 4.3981, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.516733398806376, |
| "grad_norm": 15.280237197875977, |
| "learning_rate": 4.973077787313099e-06, |
| "loss": 4.4554, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.5197552315479338, |
| "grad_norm": 17.290264129638672, |
| "learning_rate": 4.924130854798983e-06, |
| "loss": 5.1108, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.5227770642894916, |
| "grad_norm": 15.63051700592041, |
| "learning_rate": 4.875191193751803e-06, |
| "loss": 2.8006, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.5257988970310493, |
| "grad_norm": 15.663633346557617, |
| "learning_rate": 4.826263494657077e-06, |
| "loss": 3.4979, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.5288207297726071, |
| "grad_norm": 35.42136001586914, |
| "learning_rate": 4.777352446853863e-06, |
| "loss": 4.9996, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.5318425625141648, |
| "grad_norm": 23.063594818115234, |
| "learning_rate": 4.72846273808533e-06, |
| "loss": 3.509, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.5348643952557226, |
| "grad_norm": 21.706233978271484, |
| "learning_rate": 4.679599054049458e-06, |
| "loss": 3.3899, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.5378862279972804, |
| "grad_norm": 20.82579231262207, |
| "learning_rate": 4.630766077949965e-06, |
| "loss": 5.9861, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.5409080607388381, |
| "grad_norm": 32.06898880004883, |
| "learning_rate": 4.5819684900474484e-06, |
| "loss": 4.3172, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.5439298934803959, |
| "grad_norm": 16.330984115600586, |
| "learning_rate": 4.5332109672108245e-06, |
| "loss": 4.4365, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.5469517262219536, |
| "grad_norm": 17.189834594726562, |
| "learning_rate": 4.484498182469085e-06, |
| "loss": 3.6319, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.5499735589635114, |
| "grad_norm": 19.211336135864258, |
| "learning_rate": 4.435834804563422e-06, |
| "loss": 5.8999, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.5529953917050692, |
| "grad_norm": 26.310638427734375, |
| "learning_rate": 4.387225497499767e-06, |
| "loss": 3.5792, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.5560172244466269, |
| "grad_norm": 20.680715560913086, |
| "learning_rate": 4.3386749201017856e-06, |
| "loss": 3.4555, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.5590390571881846, |
| "grad_norm": 15.533769607543945, |
| "learning_rate": 4.290187725564356e-06, |
| "loss": 6.0278, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.5620608899297423, |
| "grad_norm": 13.684257507324219, |
| "learning_rate": 4.2417685610076135e-06, |
| "loss": 3.4758, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.5650827226713001, |
| "grad_norm": 15.711587905883789, |
| "learning_rate": 4.193422067031535e-06, |
| "loss": 4.3166, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.5681045554128579, |
| "grad_norm": 18.764991760253906, |
| "learning_rate": 4.145152877271196e-06, |
| "loss": 4.1625, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.5711263881544156, |
| "grad_norm": 19.19873809814453, |
| "learning_rate": 4.096965617952667e-06, |
| "loss": 4.4233, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.5741482208959734, |
| "grad_norm": 20.817365646362305, |
| "learning_rate": 4.048864907449619e-06, |
| "loss": 3.5268, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5771700536375312, |
| "grad_norm": 18.440645217895508, |
| "learning_rate": 4.000855355840695e-06, |
| "loss": 3.5747, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.5801918863790889, |
| "grad_norm": 15.997143745422363, |
| "learning_rate": 3.952941564467665e-06, |
| "loss": 4.2257, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.5832137191206467, |
| "grad_norm": 20.629562377929688, |
| "learning_rate": 3.905128125494427e-06, |
| "loss": 4.3136, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.5862355518622044, |
| "grad_norm": 33.730995178222656, |
| "learning_rate": 3.8574196214668876e-06, |
| "loss": 4.509, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.5892573846037622, |
| "grad_norm": 30.045576095581055, |
| "learning_rate": 3.8098206248737486e-06, |
| "loss": 5.139, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.59227921734532, |
| "grad_norm": 23.693470001220703, |
| "learning_rate": 3.7623356977082794e-06, |
| "loss": 2.5913, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.5953010500868777, |
| "grad_norm": 18.655092239379883, |
| "learning_rate": 3.714969391031084e-06, |
| "loss": 4.3328, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.5983228828284355, |
| "grad_norm": 15.45345687866211, |
| "learning_rate": 3.6677262445339136e-06, |
| "loss": 3.5691, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.6013447155699932, |
| "grad_norm": 21.302995681762695, |
| "learning_rate": 3.6206107861045803e-06, |
| "loss": 2.5934, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.604366548311551, |
| "grad_norm": 13.75935173034668, |
| "learning_rate": 3.5736275313929826e-06, |
| "loss": 4.3405, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.6073883810531087, |
| "grad_norm": 17.593429565429688, |
| "learning_rate": 3.5267809833783213e-06, |
| "loss": 4.8443, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.6104102137946664, |
| "grad_norm": 23.467853546142578, |
| "learning_rate": 3.4800756319375326e-06, |
| "loss": 3.4879, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.6134320465362242, |
| "grad_norm": 25.12725830078125, |
| "learning_rate": 3.433515953414953e-06, |
| "loss": 2.7966, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.616453879277782, |
| "grad_norm": 33.0245475769043, |
| "learning_rate": 3.387106410193308e-06, |
| "loss": 5.8078, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.6194757120193397, |
| "grad_norm": 18.8001651763916, |
| "learning_rate": 3.3408514502660195e-06, |
| "loss": 5.2049, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.6224975447608975, |
| "grad_norm": 16.787553787231445, |
| "learning_rate": 3.2947555068109057e-06, |
| "loss": 3.3988, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.6255193775024552, |
| "grad_norm": 21.532262802124023, |
| "learning_rate": 3.248822997765295e-06, |
| "loss": 2.815, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.628541210244013, |
| "grad_norm": 24.630603790283203, |
| "learning_rate": 3.203058325402599e-06, |
| "loss": 4.3332, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.6315630429855708, |
| "grad_norm": 16.667922973632812, |
| "learning_rate": 3.1574658759103904e-06, |
| "loss": 4.3038, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.6345848757271285, |
| "grad_norm": 20.671772003173828, |
| "learning_rate": 3.1120500189700204e-06, |
| "loss": 3.4132, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.6376067084686863, |
| "grad_norm": 21.932987213134766, |
| "learning_rate": 3.066815107337815e-06, |
| "loss": 4.1988, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.640628541210244, |
| "grad_norm": 17.348411560058594, |
| "learning_rate": 3.0217654764279114e-06, |
| "loss": 3.5937, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.6436503739518018, |
| "grad_norm": 25.625871658325195, |
| "learning_rate": 2.9769054438967192e-06, |
| "loss": 5.9817, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.6466722066933596, |
| "grad_norm": 31.0660457611084, |
| "learning_rate": 2.9322393092291256e-06, |
| "loss": 5.6772, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.6496940394349173, |
| "grad_norm": 20.511960983276367, |
| "learning_rate": 2.887771353326422e-06, |
| "loss": 4.2915, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.6527158721764751, |
| "grad_norm": 17.798234939575195, |
| "learning_rate": 2.8435058380959957e-06, |
| "loss": 2.642, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.6557377049180327, |
| "grad_norm": 18.133886337280273, |
| "learning_rate": 2.7994470060428835e-06, |
| "loss": 4.1208, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.6587595376595905, |
| "grad_norm": 18.74016571044922, |
| "learning_rate": 2.7555990798631436e-06, |
| "loss": 4.8817, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.6617813704011483, |
| "grad_norm": 15.885804176330566, |
| "learning_rate": 2.711966262039145e-06, |
| "loss": 3.3242, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.664803203142706, |
| "grad_norm": 24.100414276123047, |
| "learning_rate": 2.668552734436802e-06, |
| "loss": 4.3377, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6678250358842638, |
| "grad_norm": 17.113306045532227, |
| "learning_rate": 2.6253626579047653e-06, |
| "loss": 5.7855, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.6708468686258215, |
| "grad_norm": 33.268699645996094, |
| "learning_rate": 2.582400171875638e-06, |
| "loss": 3.4326, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.6738687013673793, |
| "grad_norm": 29.673768997192383, |
| "learning_rate": 2.5396693939692474e-06, |
| "loss": 4.8596, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.6768905341089371, |
| "grad_norm": 14.550185203552246, |
| "learning_rate": 2.4971744195979985e-06, |
| "loss": 5.1031, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.6799123668504948, |
| "grad_norm": 32.16508102416992, |
| "learning_rate": 2.4549193215743706e-06, |
| "loss": 5.833, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.6829341995920526, |
| "grad_norm": 18.873088836669922, |
| "learning_rate": 2.4129081497205536e-06, |
| "loss": 3.3544, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.6859560323336104, |
| "grad_norm": 31.875137329101562, |
| "learning_rate": 2.3711449304803174e-06, |
| "loss": 4.0864, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.6889778650751681, |
| "grad_norm": 27.996572494506836, |
| "learning_rate": 2.329633666533103e-06, |
| "loss": 4.0582, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.6919996978167259, |
| "grad_norm": 19.299062728881836, |
| "learning_rate": 2.288378336410398e-06, |
| "loss": 4.2188, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.6950215305582836, |
| "grad_norm": 21.146148681640625, |
| "learning_rate": 2.2473828941144277e-06, |
| "loss": 4.8756, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6980433632998414, |
| "grad_norm": 28.3226261138916, |
| "learning_rate": 2.20665126873919e-06, |
| "loss": 3.3593, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.7010651960413992, |
| "grad_norm": 16.02470588684082, |
| "learning_rate": 2.1661873640938818e-06, |
| "loss": 4.1255, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.7040870287829568, |
| "grad_norm": 21.263837814331055, |
| "learning_rate": 2.1259950583287633e-06, |
| "loss": 4.145, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.7071088615245146, |
| "grad_norm": 22.879661560058594, |
| "learning_rate": 2.086078203563439e-06, |
| "loss": 4.7453, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.7101306942660723, |
| "grad_norm": 15.726652145385742, |
| "learning_rate": 2.0464406255176967e-06, |
| "loss": 4.019, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.7131525270076301, |
| "grad_norm": 30.606904983520508, |
| "learning_rate": 2.0070861231448142e-06, |
| "loss": 4.9014, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.7161743597491879, |
| "grad_norm": 17.185054779052734, |
| "learning_rate": 1.968018468267472e-06, |
| "loss": 4.1918, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.7191961924907456, |
| "grad_norm": 15.510167121887207, |
| "learning_rate": 1.929241405216254e-06, |
| "loss": 4.0934, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.7222180252323034, |
| "grad_norm": 20.12055206298828, |
| "learning_rate": 1.8907586504707776e-06, |
| "loss": 4.701, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.7252398579738611, |
| "grad_norm": 19.135282516479492, |
| "learning_rate": 1.8525738923035002e-06, |
| "loss": 2.5439, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.7282616907154189, |
| "grad_norm": 19.167003631591797, |
| "learning_rate": 1.8146907904262268e-06, |
| "loss": 4.2791, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.7312835234569767, |
| "grad_norm": 24.79986572265625, |
| "learning_rate": 1.7771129756393545e-06, |
| "loss": 3.4256, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.7343053561985344, |
| "grad_norm": 20.59393310546875, |
| "learning_rate": 1.7398440494838947e-06, |
| "loss": 3.5206, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.7373271889400922, |
| "grad_norm": 25.903627395629883, |
| "learning_rate": 1.7028875838962822e-06, |
| "loss": 4.1281, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.74034902168165, |
| "grad_norm": 35.45489501953125, |
| "learning_rate": 1.6662471208660392e-06, |
| "loss": 4.0468, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.7433708544232077, |
| "grad_norm": 20.3117618560791, |
| "learning_rate": 1.6299261720963095e-06, |
| "loss": 4.1749, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.7463926871647655, |
| "grad_norm": 15.878867149353027, |
| "learning_rate": 1.5939282186672705e-06, |
| "loss": 4.8916, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.7494145199063232, |
| "grad_norm": 19.15277099609375, |
| "learning_rate": 1.5582567107025237e-06, |
| "loss": 4.8288, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.7524363526478809, |
| "grad_norm": 29.44374656677246, |
| "learning_rate": 1.5229150670384057e-06, |
| "loss": 3.3806, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.7554581853894387, |
| "grad_norm": 23.206140518188477, |
| "learning_rate": 1.4879066748963295e-06, |
| "loss": 2.5563, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.7584800181309964, |
| "grad_norm": 27.133193969726562, |
| "learning_rate": 1.4532348895581466e-06, |
| "loss": 3.4434, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.7615018508725542, |
| "grad_norm": 29.599319458007812, |
| "learning_rate": 1.4189030340445648e-06, |
| "loss": 6.7087, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.7645236836141119, |
| "grad_norm": 17.123348236083984, |
| "learning_rate": 1.3849143987966646e-06, |
| "loss": 4.9595, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.7675455163556697, |
| "grad_norm": 16.49233627319336, |
| "learning_rate": 1.3512722413605356e-06, |
| "loss": 4.0857, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.7705673490972275, |
| "grad_norm": 16.6666316986084, |
| "learning_rate": 1.3179797860750654e-06, |
| "loss": 4.8943, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.7735891818387852, |
| "grad_norm": 19.440494537353516, |
| "learning_rate": 1.2850402237629184e-06, |
| "loss": 4.1448, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.776611014580343, |
| "grad_norm": 14.674943923950195, |
| "learning_rate": 1.2524567114247083e-06, |
| "loss": 3.3491, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.7796328473219007, |
| "grad_norm": 16.349637985229492, |
| "learning_rate": 1.2202323719364324e-06, |
| "loss": 3.2897, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.7826546800634585, |
| "grad_norm": 19.67890739440918, |
| "learning_rate": 1.1883702937501708e-06, |
| "loss": 4.0901, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.7856765128050163, |
| "grad_norm": 21.339618682861328, |
| "learning_rate": 1.1568735305980694e-06, |
| "loss": 4.1003, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.788698345546574, |
| "grad_norm": 21.269119262695312, |
| "learning_rate": 1.1257451011996807e-06, |
| "loss": 3.4165, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.7917201782881318, |
| "grad_norm": 33.041419982910156, |
| "learning_rate": 1.0949879889726295e-06, |
| "loss": 3.4622, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.7947420110296896, |
| "grad_norm": 28.960115432739258, |
| "learning_rate": 1.0646051417466801e-06, |
| "loss": 3.4136, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.7977638437712473, |
| "grad_norm": 24.76239013671875, |
| "learning_rate": 1.0345994714812135e-06, |
| "loss": 4.1335, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.800785676512805, |
| "grad_norm": 15.773963928222656, |
| "learning_rate": 1.0049738539861332e-06, |
| "loss": 3.2818, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.8038075092543627, |
| "grad_norm": 21.248395919799805, |
| "learning_rate": 9.757311286462428e-07, |
| "loss": 4.1348, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.8068293419959205, |
| "grad_norm": 23.75290298461914, |
| "learning_rate": 9.468740981491143e-07, |
| "loss": 4.1947, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.8098511747374783, |
| "grad_norm": 16.7280330657959, |
| "learning_rate": 9.1840552821647e-07, |
| "loss": 4.0364, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.812873007479036, |
| "grad_norm": 17.696247100830078, |
| "learning_rate": 8.903281473391152e-07, |
| "loss": 3.3641, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.8158948402205938, |
| "grad_norm": 16.840299606323242, |
| "learning_rate": 8.62644646515427e-07, |
| "loss": 5.7446, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.8189166729621515, |
| "grad_norm": 13.25534725189209, |
| "learning_rate": 8.353576789934436e-07, |
| "loss": 3.3763, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.8219385057037093, |
| "grad_norm": 19.88932991027832, |
| "learning_rate": 8.084698600165797e-07, |
| "loss": 3.5133, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.8249603384452671, |
| "grad_norm": 17.921199798583984, |
| "learning_rate": 7.819837665729596e-07, |
| "loss": 4.1018, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.8279821711868248, |
| "grad_norm": 29.57664680480957, |
| "learning_rate": 7.559019371484521e-07, |
| "loss": 3.3378, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.8310040039283826, |
| "grad_norm": 17.720863342285156, |
| "learning_rate": 7.302268714833622e-07, |
| "loss": 4.1487, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.8340258366699403, |
| "grad_norm": 17.34684944152832, |
| "learning_rate": 7.049610303328541e-07, |
| "loss": 3.5199, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.8370476694114981, |
| "grad_norm": 16.739910125732422, |
| "learning_rate": 6.80106835231113e-07, |
| "loss": 4.2899, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.8400695021530559, |
| "grad_norm": 17.1294002532959, |
| "learning_rate": 6.556666682592494e-07, |
| "loss": 3.3016, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.8430913348946136, |
| "grad_norm": 14.801079750061035, |
| "learning_rate": 6.316428718170037e-07, |
| "loss": 2.4169, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.8461131676361714, |
| "grad_norm": 19.354856491088867, |
| "learning_rate": 6.080377483982425e-07, |
| "loss": 3.2883, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.849135000377729, |
| "grad_norm": 17.925838470458984, |
| "learning_rate": 5.848535603702798e-07, |
| "loss": 3.3497, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.8521568331192868, |
| "grad_norm": 20.340959548950195, |
| "learning_rate": 5.62092529757054e-07, |
| "loss": 6.4132, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.8551786658608446, |
| "grad_norm": 21.507797241210938, |
| "learning_rate": 5.397568380261559e-07, |
| "loss": 2.3404, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.8582004986024023, |
| "grad_norm": 16.9514102935791, |
| "learning_rate": 5.178486258797555e-07, |
| "loss": 4.0876, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.8612223313439601, |
| "grad_norm": 14.505171775817871, |
| "learning_rate": 4.963699930494365e-07, |
| "loss": 3.3715, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.8642441640855179, |
| "grad_norm": 22.551313400268555, |
| "learning_rate": 4.75322998094942e-07, |
| "loss": 4.2347, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.8672659968270756, |
| "grad_norm": 20.145078659057617, |
| "learning_rate": 4.5470965820689384e-07, |
| "loss": 2.5903, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.8702878295686334, |
| "grad_norm": 17.447914123535156, |
| "learning_rate": 4.345319490134453e-07, |
| "loss": 3.0177, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.8733096623101911, |
| "grad_norm": 16.10365104675293, |
| "learning_rate": 4.147918043909405e-07, |
| "loss": 4.764, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.8763314950517489, |
| "grad_norm": 19.066129684448242, |
| "learning_rate": 3.9549111627856794e-07, |
| "loss": 4.7699, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.8793533277933067, |
| "grad_norm": 19.604887008666992, |
| "learning_rate": 3.766317344970288e-07, |
| "loss": 4.1165, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.8823751605348644, |
| "grad_norm": 17.465734481811523, |
| "learning_rate": 3.582154665712473e-07, |
| "loss": 2.4443, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.8853969932764222, |
| "grad_norm": 22.400236129760742, |
| "learning_rate": 3.402440775571364e-07, |
| "loss": 4.0664, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.88841882601798, |
| "grad_norm": 21.420312881469727, |
| "learning_rate": 3.227192898724252e-07, |
| "loss": 5.7203, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.8914406587595377, |
| "grad_norm": 23.331478118896484, |
| "learning_rate": 3.056427831315878e-07, |
| "loss": 3.367, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.8944624915010955, |
| "grad_norm": 21.29648208618164, |
| "learning_rate": 2.890161939848535e-07, |
| "loss": 4.1604, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.8974843242426531, |
| "grad_norm": 15.172201156616211, |
| "learning_rate": 2.72841115961357e-07, |
| "loss": 4.2335, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.9005061569842109, |
| "grad_norm": 16.736038208007812, |
| "learning_rate": 2.5711909931640633e-07, |
| "loss": 3.9793, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.9035279897257686, |
| "grad_norm": 22.6779727935791, |
| "learning_rate": 2.418516508829e-07, |
| "loss": 2.4922, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.9065498224673264, |
| "grad_norm": 32.2912712097168, |
| "learning_rate": 2.270402339269162e-07, |
| "loss": 5.6454, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.9095716552088842, |
| "grad_norm": 18.107574462890625, |
| "learning_rate": 2.126862680074643e-07, |
| "loss": 5.0056, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.9125934879504419, |
| "grad_norm": 32.63033676147461, |
| "learning_rate": 1.9879112884043317e-07, |
| "loss": 2.5369, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.9156153206919997, |
| "grad_norm": 18.089956283569336, |
| "learning_rate": 1.853561481667404e-07, |
| "loss": 2.4556, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.9186371534335575, |
| "grad_norm": 13.772138595581055, |
| "learning_rate": 1.7238261362469256e-07, |
| "loss": 3.0884, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.9216589861751152, |
| "grad_norm": 22.537776947021484, |
| "learning_rate": 1.5987176862657883e-07, |
| "loss": 3.2805, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.924680818916673, |
| "grad_norm": 30.13243865966797, |
| "learning_rate": 1.4782481223949597e-07, |
| "loss": 3.2507, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.9277026516582307, |
| "grad_norm": 20.858510971069336, |
| "learning_rate": 1.3624289907042787e-07, |
| "loss": 4.1981, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.9307244843997885, |
| "grad_norm": 30.669658660888672, |
| "learning_rate": 1.2512713915559027e-07, |
| "loss": 4.9341, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.9337463171413463, |
| "grad_norm": 32.03891372680664, |
| "learning_rate": 1.1447859785403359e-07, |
| "loss": 4.8266, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.936768149882904, |
| "grad_norm": 18.382429122924805, |
| "learning_rate": 1.0429829574554573e-07, |
| "loss": 3.4044, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.9397899826244618, |
| "grad_norm": 16.341550827026367, |
| "learning_rate": 9.458720853282977e-08, |
| "loss": 4.1438, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.9428118153660195, |
| "grad_norm": 32.575286865234375, |
| "learning_rate": 8.534626694799485e-08, |
| "loss": 5.6917, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.9458336481075772, |
| "grad_norm": 19.515989303588867, |
| "learning_rate": 7.657635666335317e-08, |
| "loss": 2.5437, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.948855480849135, |
| "grad_norm": 18.81734275817871, |
| "learning_rate": 6.827831820653163e-08, |
| "loss": 2.5297, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.9518773135906927, |
| "grad_norm": 20.44892120361328, |
| "learning_rate": 6.045294687991643e-08, |
| "loss": 5.3046, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.9548991463322505, |
| "grad_norm": 30.222261428833008, |
| "learning_rate": 5.310099268443114e-08, |
| "loss": 7.1585, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.9579209790738082, |
| "grad_norm": 22.93487548828125, |
| "learning_rate": 4.622316024765039e-08, |
| "loss": 3.9296, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.960942811815366, |
| "grad_norm": 20.129398345947266, |
| "learning_rate": 3.982010875626885e-08, |
| "loss": 3.2971, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.9639646445569238, |
| "grad_norm": 20.64815330505371, |
| "learning_rate": 3.389245189292622e-08, |
| "loss": 4.1501, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.9669864772984815, |
| "grad_norm": 19.435129165649414, |
| "learning_rate": 2.8440757777385976e-08, |
| "loss": 4.9552, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.9700083100400393, |
| "grad_norm": 17.719867706298828, |
| "learning_rate": 2.3465548912088298e-08, |
| "loss": 2.6329, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.973030142781597, |
| "grad_norm": 21.178937911987305, |
| "learning_rate": 1.896730213207132e-08, |
| "loss": 4.0836, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.9760519755231548, |
| "grad_norm": 16.906330108642578, |
| "learning_rate": 1.4946448559270964e-08, |
| "loss": 2.397, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.9790738082647126, |
| "grad_norm": 23.301292419433594, |
| "learning_rate": 1.1403373561199583e-08, |
| "loss": 4.2365, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.9820956410062703, |
| "grad_norm": 20.07245635986328, |
| "learning_rate": 8.338416714013254e-09, |
| "loss": 3.444, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.9851174737478281, |
| "grad_norm": 16.27911949157715, |
| "learning_rate": 5.751871769965056e-09, |
| "loss": 5.5038, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.9881393064893859, |
| "grad_norm": 21.404827117919922, |
| "learning_rate": 3.643986629253138e-09, |
| "loss": 4.1734, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.9911611392309436, |
| "grad_norm": 32.63972473144531, |
| "learning_rate": 2.014963316257501e-09, |
| "loss": 4.9837, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.9941829719725013, |
| "grad_norm": 19.831165313720703, |
| "learning_rate": 8.649579601810454e-10, |
| "loss": 4.9867, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.997204804714059, |
| "grad_norm": 21.53673553466797, |
| "learning_rate": 1.9408078008431587e-10, |
| "loss": 3.3738, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.9996222709073053, |
| "step": 3308, |
| "total_flos": 1.175877708593234e+19, |
| "train_loss": 4.4771003486744005, |
| "train_runtime": 52781.7624, |
| "train_samples_per_second": 4.013, |
| "train_steps_per_second": 0.063 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 3309, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.175877708593234e+19, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|