{ "best_metric": 0.4919604957103729, "best_model_checkpoint": "./vit-lr-linear/checkpoint-800", "epoch": 5.607476635514018, "eval_steps": 100, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 3.5967516899108887, "learning_rate": 9.997196261682243e-05, "loss": 1.2508, "step": 10 }, { "epoch": 0.06, "grad_norm": 4.702871799468994, "learning_rate": 9.994080996884736e-05, "loss": 0.7499, "step": 20 }, { "epoch": 0.09, "grad_norm": 6.171701431274414, "learning_rate": 9.990965732087227e-05, "loss": 0.8834, "step": 30 }, { "epoch": 0.12, "grad_norm": 10.749275207519531, "learning_rate": 9.98785046728972e-05, "loss": 1.0502, "step": 40 }, { "epoch": 0.16, "grad_norm": 4.334569931030273, "learning_rate": 9.984735202492212e-05, "loss": 0.769, "step": 50 }, { "epoch": 0.19, "grad_norm": 4.809230327606201, "learning_rate": 9.981619937694705e-05, "loss": 0.9034, "step": 60 }, { "epoch": 0.22, "grad_norm": 3.983206272125244, "learning_rate": 9.978504672897196e-05, "loss": 0.8274, "step": 70 }, { "epoch": 0.25, "grad_norm": 3.574300765991211, "learning_rate": 9.975389408099689e-05, "loss": 0.6348, "step": 80 }, { "epoch": 0.28, "grad_norm": 5.728092670440674, "learning_rate": 9.972274143302182e-05, "loss": 0.6568, "step": 90 }, { "epoch": 0.31, "grad_norm": 4.649799823760986, "learning_rate": 9.969158878504672e-05, "loss": 0.6029, "step": 100 }, { "epoch": 0.31, "eval_accuracy": 0.7805131761442441, "eval_f1": 0.7528727438506023, "eval_loss": 0.6126354932785034, "eval_precision": 0.7602026831090402, "eval_recall": 0.7805131761442441, "eval_runtime": 39.2505, "eval_samples_per_second": 73.477, "eval_steps_per_second": 9.197, "step": 100 }, { "epoch": 0.34, "grad_norm": 5.011153221130371, "learning_rate": 9.966043613707165e-05, "loss": 0.6847, "step": 110 }, { "epoch": 0.37, "grad_norm": 7.656059741973877, "learning_rate": 9.962928348909658e-05, "loss": 0.6162, "step": 120 }, { "epoch": 0.4, "grad_norm": 7.763223648071289, "learning_rate": 9.95981308411215e-05, "loss": 0.6389, "step": 130 }, { "epoch": 0.44, "grad_norm": 2.2059271335601807, "learning_rate": 9.956697819314643e-05, "loss": 0.5381, "step": 140 }, { "epoch": 0.47, "grad_norm": 6.06673002243042, "learning_rate": 9.953582554517134e-05, "loss": 0.5674, "step": 150 }, { "epoch": 0.5, "grad_norm": 5.407893657684326, "learning_rate": 9.950467289719627e-05, "loss": 0.5598, "step": 160 }, { "epoch": 0.53, "grad_norm": 7.515843391418457, "learning_rate": 9.947352024922119e-05, "loss": 0.5488, "step": 170 }, { "epoch": 0.56, "grad_norm": 7.197587966918945, "learning_rate": 9.944236760124612e-05, "loss": 0.7078, "step": 180 }, { "epoch": 0.59, "grad_norm": 4.026732921600342, "learning_rate": 9.941121495327103e-05, "loss": 0.595, "step": 190 }, { "epoch": 0.62, "grad_norm": 5.807314872741699, "learning_rate": 9.938006230529595e-05, "loss": 0.5726, "step": 200 }, { "epoch": 0.62, "eval_accuracy": 0.7649098474341193, "eval_f1": 0.7177347277722194, "eval_loss": 0.6950196623802185, "eval_precision": 0.7613231169607695, "eval_recall": 0.7649098474341193, "eval_runtime": 39.3588, "eval_samples_per_second": 73.275, "eval_steps_per_second": 9.172, "step": 200 }, { "epoch": 0.65, "grad_norm": 5.1864399909973145, "learning_rate": 9.934890965732088e-05, "loss": 0.6681, "step": 210 }, { "epoch": 0.69, "grad_norm": 7.364803314208984, "learning_rate": 9.93177570093458e-05, "loss": 0.7502, "step": 220 }, { "epoch": 0.72, "grad_norm": 1.9555000066757202, "learning_rate": 9.928660436137072e-05, "loss": 0.3976, "step": 230 }, { "epoch": 0.75, "grad_norm": 2.5017478466033936, "learning_rate": 9.925545171339564e-05, "loss": 0.5505, "step": 240 }, { "epoch": 0.78, "grad_norm": 4.633613109588623, "learning_rate": 9.922429906542056e-05, "loss": 0.7452, "step": 250 }, { "epoch": 0.81, "grad_norm": 4.068040370941162, "learning_rate": 9.91931464174455e-05, "loss": 0.5837, "step": 260 }, { "epoch": 0.84, "grad_norm": 3.191166639328003, "learning_rate": 9.916199376947041e-05, "loss": 0.4488, "step": 270 }, { "epoch": 0.87, "grad_norm": 2.8749473094940186, "learning_rate": 9.913084112149534e-05, "loss": 0.4906, "step": 280 }, { "epoch": 0.9, "grad_norm": 7.547589302062988, "learning_rate": 9.909968847352025e-05, "loss": 0.6191, "step": 290 }, { "epoch": 0.93, "grad_norm": 5.1279096603393555, "learning_rate": 9.906853582554517e-05, "loss": 0.6521, "step": 300 }, { "epoch": 0.93, "eval_accuracy": 0.8124133148404993, "eval_f1": 0.8060484287452215, "eval_loss": 0.5102406740188599, "eval_precision": 0.8148843275891411, "eval_recall": 0.8124133148404993, "eval_runtime": 39.7509, "eval_samples_per_second": 72.552, "eval_steps_per_second": 9.082, "step": 300 }, { "epoch": 0.97, "grad_norm": 3.052353858947754, "learning_rate": 9.90373831775701e-05, "loss": 0.5616, "step": 310 }, { "epoch": 1.0, "grad_norm": 4.507020473480225, "learning_rate": 9.900623052959503e-05, "loss": 0.5192, "step": 320 }, { "epoch": 1.03, "grad_norm": 5.176488876342773, "learning_rate": 9.897507788161994e-05, "loss": 0.4461, "step": 330 }, { "epoch": 1.06, "grad_norm": 4.797460079193115, "learning_rate": 9.894392523364486e-05, "loss": 0.3651, "step": 340 }, { "epoch": 1.09, "grad_norm": 2.186629295349121, "learning_rate": 9.891277258566979e-05, "loss": 0.4004, "step": 350 }, { "epoch": 1.12, "grad_norm": 2.1874918937683105, "learning_rate": 9.888161993769472e-05, "loss": 0.3475, "step": 360 }, { "epoch": 1.15, "grad_norm": 4.8143486976623535, "learning_rate": 9.885046728971963e-05, "loss": 0.4073, "step": 370 }, { "epoch": 1.18, "grad_norm": 7.921704292297363, "learning_rate": 9.881931464174455e-05, "loss": 0.3765, "step": 380 }, { "epoch": 1.21, "grad_norm": 5.491533279418945, "learning_rate": 9.878816199376948e-05, "loss": 0.4707, "step": 390 }, { "epoch": 1.25, "grad_norm": 2.563472032546997, "learning_rate": 9.875700934579439e-05, "loss": 0.3803, "step": 400 }, { "epoch": 1.25, "eval_accuracy": 0.7843273231622746, "eval_f1": 0.7933752302092828, "eval_loss": 0.6124634146690369, "eval_precision": 0.8128379357321871, "eval_recall": 0.7843273231622746, "eval_runtime": 39.1164, "eval_samples_per_second": 73.729, "eval_steps_per_second": 9.229, "step": 400 }, { "epoch": 1.28, "grad_norm": 3.978813648223877, "learning_rate": 9.872585669781932e-05, "loss": 0.4527, "step": 410 }, { "epoch": 1.31, "grad_norm": 3.3713996410369873, "learning_rate": 9.869470404984425e-05, "loss": 0.4344, "step": 420 }, { "epoch": 1.34, "grad_norm": 3.685149669647217, "learning_rate": 9.866355140186917e-05, "loss": 0.5163, "step": 430 }, { "epoch": 1.37, "grad_norm": 3.9468307495117188, "learning_rate": 9.863239875389408e-05, "loss": 0.4187, "step": 440 }, { "epoch": 1.4, "grad_norm": 4.608047962188721, "learning_rate": 9.860124610591901e-05, "loss": 0.3643, "step": 450 }, { "epoch": 1.43, "grad_norm": 3.3221333026885986, "learning_rate": 9.857009345794394e-05, "loss": 0.3324, "step": 460 }, { "epoch": 1.46, "grad_norm": 3.251314640045166, "learning_rate": 9.853894080996885e-05, "loss": 0.3229, "step": 470 }, { "epoch": 1.5, "grad_norm": 7.8897552490234375, "learning_rate": 9.850778816199377e-05, "loss": 0.3684, "step": 480 }, { "epoch": 1.53, "grad_norm": 5.32474422454834, "learning_rate": 9.84766355140187e-05, "loss": 0.3567, "step": 490 }, { "epoch": 1.56, "grad_norm": 2.955794334411621, "learning_rate": 9.844548286604361e-05, "loss": 0.4048, "step": 500 }, { "epoch": 1.56, "eval_accuracy": 0.8214285714285714, "eval_f1": 0.80775004871548, "eval_loss": 0.5058895349502563, "eval_precision": 0.8156080811716645, "eval_recall": 0.8214285714285714, "eval_runtime": 39.0979, "eval_samples_per_second": 73.764, "eval_steps_per_second": 9.233, "step": 500 }, { "epoch": 1.59, "grad_norm": 3.655855894088745, "learning_rate": 9.841433021806854e-05, "loss": 0.4464, "step": 510 }, { "epoch": 1.62, "grad_norm": 3.6523778438568115, "learning_rate": 9.838317757009346e-05, "loss": 0.4154, "step": 520 }, { "epoch": 1.65, "grad_norm": 6.649527072906494, "learning_rate": 9.835202492211837e-05, "loss": 0.4337, "step": 530 }, { "epoch": 1.68, "grad_norm": 4.308875560760498, "learning_rate": 9.83208722741433e-05, "loss": 0.3551, "step": 540 }, { "epoch": 1.71, "grad_norm": 5.290976047515869, "learning_rate": 9.828971962616823e-05, "loss": 0.4329, "step": 550 }, { "epoch": 1.74, "grad_norm": 3.688100814819336, "learning_rate": 9.825856697819316e-05, "loss": 0.4306, "step": 560 }, { "epoch": 1.78, "grad_norm": 2.4794669151306152, "learning_rate": 9.822741433021808e-05, "loss": 0.3281, "step": 570 }, { "epoch": 1.81, "grad_norm": 3.5106523036956787, "learning_rate": 9.819626168224299e-05, "loss": 0.3184, "step": 580 }, { "epoch": 1.84, "grad_norm": 5.135136604309082, "learning_rate": 9.816510903426792e-05, "loss": 0.3644, "step": 590 }, { "epoch": 1.87, "grad_norm": 3.2593114376068115, "learning_rate": 9.813395638629284e-05, "loss": 0.2939, "step": 600 }, { "epoch": 1.87, "eval_accuracy": 0.7680305131761442, "eval_f1": 0.7817959651329371, "eval_loss": 0.6723023653030396, "eval_precision": 0.8366418813297796, "eval_recall": 0.7680305131761442, "eval_runtime": 39.3089, "eval_samples_per_second": 73.368, "eval_steps_per_second": 9.184, "step": 600 }, { "epoch": 1.9, "grad_norm": 7.719142913818359, "learning_rate": 9.810280373831777e-05, "loss": 0.5629, "step": 610 }, { "epoch": 1.93, "grad_norm": 4.130067348480225, "learning_rate": 9.807165109034268e-05, "loss": 0.4416, "step": 620 }, { "epoch": 1.96, "grad_norm": 1.9364572763442993, "learning_rate": 9.80404984423676e-05, "loss": 0.2339, "step": 630 }, { "epoch": 1.99, "grad_norm": 2.4033007621765137, "learning_rate": 9.800934579439253e-05, "loss": 0.3284, "step": 640 }, { "epoch": 2.02, "grad_norm": 3.25533390045166, "learning_rate": 9.797819314641746e-05, "loss": 0.2307, "step": 650 }, { "epoch": 2.06, "grad_norm": 3.1277670860290527, "learning_rate": 9.794704049844237e-05, "loss": 0.2063, "step": 660 }, { "epoch": 2.09, "grad_norm": 2.923804521560669, "learning_rate": 9.791588785046729e-05, "loss": 0.1563, "step": 670 }, { "epoch": 2.12, "grad_norm": 4.745158672332764, "learning_rate": 9.788473520249222e-05, "loss": 0.2451, "step": 680 }, { "epoch": 2.15, "grad_norm": 5.691440582275391, "learning_rate": 9.785358255451714e-05, "loss": 0.2315, "step": 690 }, { "epoch": 2.18, "grad_norm": 7.0971550941467285, "learning_rate": 9.782242990654206e-05, "loss": 0.2138, "step": 700 }, { "epoch": 2.18, "eval_accuracy": 0.812760055478502, "eval_f1": 0.8169834700772869, "eval_loss": 0.635110080242157, "eval_precision": 0.8480203726240728, "eval_recall": 0.812760055478502, "eval_runtime": 39.4295, "eval_samples_per_second": 73.143, "eval_steps_per_second": 9.156, "step": 700 }, { "epoch": 2.21, "grad_norm": 2.342768430709839, "learning_rate": 9.779127725856699e-05, "loss": 0.2763, "step": 710 }, { "epoch": 2.24, "grad_norm": 4.194737434387207, "learning_rate": 9.77601246105919e-05, "loss": 0.3159, "step": 720 }, { "epoch": 2.27, "grad_norm": 4.488862991333008, "learning_rate": 9.772897196261682e-05, "loss": 0.2594, "step": 730 }, { "epoch": 2.31, "grad_norm": 3.612229347229004, "learning_rate": 9.769781931464175e-05, "loss": 0.1651, "step": 740 }, { "epoch": 2.34, "grad_norm": 4.490918159484863, "learning_rate": 9.766666666666668e-05, "loss": 0.1714, "step": 750 }, { "epoch": 2.37, "grad_norm": 1.2064989805221558, "learning_rate": 9.76355140186916e-05, "loss": 0.1423, "step": 760 }, { "epoch": 2.4, "grad_norm": 3.8275320529937744, "learning_rate": 9.760436137071651e-05, "loss": 0.2647, "step": 770 }, { "epoch": 2.43, "grad_norm": 6.258784294128418, "learning_rate": 9.757320872274144e-05, "loss": 0.1925, "step": 780 }, { "epoch": 2.46, "grad_norm": 6.503872394561768, "learning_rate": 9.754205607476637e-05, "loss": 0.3565, "step": 790 }, { "epoch": 2.49, "grad_norm": 4.768139362335205, "learning_rate": 9.751090342679128e-05, "loss": 0.2615, "step": 800 }, { "epoch": 2.49, "eval_accuracy": 0.8321775312066574, "eval_f1": 0.8323189306799796, "eval_loss": 0.4919604957103729, "eval_precision": 0.8400335349095535, "eval_recall": 0.8321775312066574, "eval_runtime": 39.8278, "eval_samples_per_second": 72.412, "eval_steps_per_second": 9.064, "step": 800 }, { "epoch": 2.52, "grad_norm": 3.3493497371673584, "learning_rate": 9.74797507788162e-05, "loss": 0.179, "step": 810 }, { "epoch": 2.55, "grad_norm": 6.209288597106934, "learning_rate": 9.744859813084113e-05, "loss": 0.1872, "step": 820 }, { "epoch": 2.59, "grad_norm": 3.6066982746124268, "learning_rate": 9.741744548286604e-05, "loss": 0.24, "step": 830 }, { "epoch": 2.62, "grad_norm": 2.330963611602783, "learning_rate": 9.738629283489097e-05, "loss": 0.177, "step": 840 }, { "epoch": 2.65, "grad_norm": 0.31879034638404846, "learning_rate": 9.73551401869159e-05, "loss": 0.1846, "step": 850 }, { "epoch": 2.68, "grad_norm": 6.56818151473999, "learning_rate": 9.732398753894082e-05, "loss": 0.2731, "step": 860 }, { "epoch": 2.71, "grad_norm": 1.255943775177002, "learning_rate": 9.729283489096573e-05, "loss": 0.3217, "step": 870 }, { "epoch": 2.74, "grad_norm": 5.009977340698242, "learning_rate": 9.726168224299066e-05, "loss": 0.2576, "step": 880 }, { "epoch": 2.77, "grad_norm": 1.8944584131240845, "learning_rate": 9.723052959501559e-05, "loss": 0.2223, "step": 890 }, { "epoch": 2.8, "grad_norm": 3.543699026107788, "learning_rate": 9.71993769470405e-05, "loss": 0.2125, "step": 900 }, { "epoch": 2.8, "eval_accuracy": 0.8491678224687933, "eval_f1": 0.843224216265628, "eval_loss": 0.5596445202827454, "eval_precision": 0.8508841434558159, "eval_recall": 0.8491678224687933, "eval_runtime": 39.2828, "eval_samples_per_second": 73.416, "eval_steps_per_second": 9.19, "step": 900 }, { "epoch": 2.83, "grad_norm": 5.571394920349121, "learning_rate": 9.716822429906542e-05, "loss": 0.2826, "step": 910 }, { "epoch": 2.87, "grad_norm": 5.057092666625977, "learning_rate": 9.713707165109035e-05, "loss": 0.2647, "step": 920 }, { "epoch": 2.9, "grad_norm": 3.190361738204956, "learning_rate": 9.710591900311527e-05, "loss": 0.4015, "step": 930 }, { "epoch": 2.93, "grad_norm": 6.205695629119873, "learning_rate": 9.70747663551402e-05, "loss": 0.312, "step": 940 }, { "epoch": 2.96, "grad_norm": 4.805609226226807, "learning_rate": 9.704361370716511e-05, "loss": 0.2098, "step": 950 }, { "epoch": 2.99, "grad_norm": 3.2721455097198486, "learning_rate": 9.701246105919004e-05, "loss": 0.1959, "step": 960 }, { "epoch": 3.02, "grad_norm": 4.190710067749023, "learning_rate": 9.698130841121495e-05, "loss": 0.1048, "step": 970 }, { "epoch": 3.05, "grad_norm": 0.5176565051078796, "learning_rate": 9.695015576323988e-05, "loss": 0.0816, "step": 980 }, { "epoch": 3.08, "grad_norm": 0.1512940376996994, "learning_rate": 9.691900311526481e-05, "loss": 0.0327, "step": 990 }, { "epoch": 3.12, "grad_norm": 5.473361015319824, "learning_rate": 9.688785046728971e-05, "loss": 0.0768, "step": 1000 }, { "epoch": 3.12, "eval_accuracy": 0.8290568654646324, "eval_f1": 0.8235345080050543, "eval_loss": 0.823904275894165, "eval_precision": 0.8499722083747269, "eval_recall": 0.8290568654646324, "eval_runtime": 39.6039, "eval_samples_per_second": 72.821, "eval_steps_per_second": 9.115, "step": 1000 }, { "epoch": 3.15, "grad_norm": 2.461423635482788, "learning_rate": 9.685669781931464e-05, "loss": 0.1258, "step": 1010 }, { "epoch": 3.18, "grad_norm": 5.284664154052734, "learning_rate": 9.682554517133957e-05, "loss": 0.1124, "step": 1020 }, { "epoch": 3.21, "grad_norm": 1.3465226888656616, "learning_rate": 9.679439252336449e-05, "loss": 0.0986, "step": 1030 }, { "epoch": 3.24, "grad_norm": 3.916722297668457, "learning_rate": 9.676323987538942e-05, "loss": 0.0541, "step": 1040 }, { "epoch": 3.27, "grad_norm": 0.6409209966659546, "learning_rate": 9.673208722741433e-05, "loss": 0.1176, "step": 1050 }, { "epoch": 3.3, "grad_norm": 4.325451374053955, "learning_rate": 9.670093457943926e-05, "loss": 0.2169, "step": 1060 }, { "epoch": 3.33, "grad_norm": 0.5127350687980652, "learning_rate": 9.666978193146418e-05, "loss": 0.0739, "step": 1070 }, { "epoch": 3.36, "grad_norm": 7.527690410614014, "learning_rate": 9.66386292834891e-05, "loss": 0.184, "step": 1080 }, { "epoch": 3.4, "grad_norm": 2.938415765762329, "learning_rate": 9.660747663551402e-05, "loss": 0.1526, "step": 1090 }, { "epoch": 3.43, "grad_norm": 0.9604325890541077, "learning_rate": 9.657632398753894e-05, "loss": 0.0649, "step": 1100 }, { "epoch": 3.43, "eval_accuracy": 0.8366851595006934, "eval_f1": 0.8359690431594896, "eval_loss": 0.6827093958854675, "eval_precision": 0.8480904372623412, "eval_recall": 0.8366851595006934, "eval_runtime": 39.8316, "eval_samples_per_second": 72.405, "eval_steps_per_second": 9.063, "step": 1100 }, { "epoch": 3.46, "grad_norm": 0.6656555533409119, "learning_rate": 9.654517133956387e-05, "loss": 0.0866, "step": 1110 }, { "epoch": 3.49, "grad_norm": 1.8327091932296753, "learning_rate": 9.65140186915888e-05, "loss": 0.1117, "step": 1120 }, { "epoch": 3.52, "grad_norm": 6.639819145202637, "learning_rate": 9.648286604361371e-05, "loss": 0.0607, "step": 1130 }, { "epoch": 3.55, "grad_norm": 9.595551490783691, "learning_rate": 9.645171339563863e-05, "loss": 0.1603, "step": 1140 }, { "epoch": 3.58, "grad_norm": 4.852327346801758, "learning_rate": 9.642056074766356e-05, "loss": 0.1289, "step": 1150 }, { "epoch": 3.61, "grad_norm": 0.8141772150993347, "learning_rate": 9.638940809968848e-05, "loss": 0.2513, "step": 1160 }, { "epoch": 3.64, "grad_norm": 2.2528672218322754, "learning_rate": 9.63582554517134e-05, "loss": 0.1033, "step": 1170 }, { "epoch": 3.68, "grad_norm": 4.099298000335693, "learning_rate": 9.632710280373833e-05, "loss": 0.2042, "step": 1180 }, { "epoch": 3.71, "grad_norm": 2.307119131088257, "learning_rate": 9.629595015576324e-05, "loss": 0.0714, "step": 1190 }, { "epoch": 3.74, "grad_norm": 6.8174543380737305, "learning_rate": 9.626479750778816e-05, "loss": 0.1382, "step": 1200 }, { "epoch": 3.74, "eval_accuracy": 0.84500693481276, "eval_f1": 0.8398698444204201, "eval_loss": 0.6838334798812866, "eval_precision": 0.8466724060209081, "eval_recall": 0.84500693481276, "eval_runtime": 39.4271, "eval_samples_per_second": 73.148, "eval_steps_per_second": 9.156, "step": 1200 }, { "epoch": 3.77, "grad_norm": 3.4418959617614746, "learning_rate": 9.623364485981309e-05, "loss": 0.0921, "step": 1210 }, { "epoch": 3.8, "grad_norm": 3.6201603412628174, "learning_rate": 9.620249221183802e-05, "loss": 0.15, "step": 1220 }, { "epoch": 3.83, "grad_norm": 6.8857550621032715, "learning_rate": 9.617133956386293e-05, "loss": 0.0965, "step": 1230 }, { "epoch": 3.86, "grad_norm": 3.10553240776062, "learning_rate": 9.614018691588785e-05, "loss": 0.1674, "step": 1240 }, { "epoch": 3.89, "grad_norm": 9.131609916687012, "learning_rate": 9.610903426791278e-05, "loss": 0.1596, "step": 1250 }, { "epoch": 3.93, "grad_norm": 0.35134002566337585, "learning_rate": 9.607788161993771e-05, "loss": 0.1157, "step": 1260 }, { "epoch": 3.96, "grad_norm": 5.575935363769531, "learning_rate": 9.604672897196262e-05, "loss": 0.0822, "step": 1270 }, { "epoch": 3.99, "grad_norm": 0.6137746572494507, "learning_rate": 9.601557632398754e-05, "loss": 0.1316, "step": 1280 }, { "epoch": 4.02, "grad_norm": 0.34164953231811523, "learning_rate": 9.598442367601247e-05, "loss": 0.0739, "step": 1290 }, { "epoch": 4.05, "grad_norm": 4.076730728149414, "learning_rate": 9.595327102803738e-05, "loss": 0.0486, "step": 1300 }, { "epoch": 4.05, "eval_accuracy": 0.8578363384188626, "eval_f1": 0.8494463868744596, "eval_loss": 0.6367300748825073, "eval_precision": 0.8548000651229425, "eval_recall": 0.8578363384188626, "eval_runtime": 39.1728, "eval_samples_per_second": 73.623, "eval_steps_per_second": 9.216, "step": 1300 }, { "epoch": 4.08, "grad_norm": 0.4267037808895111, "learning_rate": 9.592211838006231e-05, "loss": 0.0499, "step": 1310 }, { "epoch": 4.11, "grad_norm": 8.932145118713379, "learning_rate": 9.589096573208724e-05, "loss": 0.058, "step": 1320 }, { "epoch": 4.14, "grad_norm": 7.81501579284668, "learning_rate": 9.585981308411214e-05, "loss": 0.0497, "step": 1330 }, { "epoch": 4.17, "grad_norm": 3.25376296043396, "learning_rate": 9.582866043613707e-05, "loss": 0.0613, "step": 1340 }, { "epoch": 4.21, "grad_norm": 0.009625586681067944, "learning_rate": 9.5797507788162e-05, "loss": 0.0882, "step": 1350 }, { "epoch": 4.24, "grad_norm": 8.644308090209961, "learning_rate": 9.576635514018693e-05, "loss": 0.0729, "step": 1360 }, { "epoch": 4.27, "grad_norm": 11.613913536071777, "learning_rate": 9.573520249221185e-05, "loss": 0.1285, "step": 1370 }, { "epoch": 4.3, "grad_norm": 0.9490543603897095, "learning_rate": 9.570404984423676e-05, "loss": 0.0408, "step": 1380 }, { "epoch": 4.33, "grad_norm": 2.557040214538574, "learning_rate": 9.567289719626169e-05, "loss": 0.0689, "step": 1390 }, { "epoch": 4.36, "grad_norm": 7.547731399536133, "learning_rate": 9.56417445482866e-05, "loss": 0.1122, "step": 1400 }, { "epoch": 4.36, "eval_accuracy": 0.8398058252427184, "eval_f1": 0.833035822565054, "eval_loss": 0.7330206036567688, "eval_precision": 0.836759139491613, "eval_recall": 0.8398058252427184, "eval_runtime": 39.5302, "eval_samples_per_second": 72.957, "eval_steps_per_second": 9.132, "step": 1400 }, { "epoch": 4.39, "grad_norm": 0.47195371985435486, "learning_rate": 9.561059190031153e-05, "loss": 0.0087, "step": 1410 }, { "epoch": 4.42, "grad_norm": 0.02496817521750927, "learning_rate": 9.557943925233645e-05, "loss": 0.0678, "step": 1420 }, { "epoch": 4.45, "grad_norm": 0.044717635959386826, "learning_rate": 9.554828660436137e-05, "loss": 0.0409, "step": 1430 }, { "epoch": 4.49, "grad_norm": 2.304049015045166, "learning_rate": 9.55171339563863e-05, "loss": 0.0661, "step": 1440 }, { "epoch": 4.52, "grad_norm": 11.10191822052002, "learning_rate": 9.548598130841122e-05, "loss": 0.3254, "step": 1450 }, { "epoch": 4.55, "grad_norm": 0.0031379794236272573, "learning_rate": 9.545482866043615e-05, "loss": 0.0655, "step": 1460 }, { "epoch": 4.58, "grad_norm": 1.050758719444275, "learning_rate": 9.542367601246105e-05, "loss": 0.0968, "step": 1470 }, { "epoch": 4.61, "grad_norm": 0.027871431782841682, "learning_rate": 9.539252336448598e-05, "loss": 0.1033, "step": 1480 }, { "epoch": 4.64, "grad_norm": 0.054837290197610855, "learning_rate": 9.536137071651091e-05, "loss": 0.0225, "step": 1490 }, { "epoch": 4.67, "grad_norm": 5.67630672454834, "learning_rate": 9.533021806853583e-05, "loss": 0.0302, "step": 1500 }, { "epoch": 4.67, "eval_accuracy": 0.84500693481276, "eval_f1": 0.8442384441117506, "eval_loss": 0.7136919498443604, "eval_precision": 0.8469740143199302, "eval_recall": 0.84500693481276, "eval_runtime": 39.1304, "eval_samples_per_second": 73.702, "eval_steps_per_second": 9.226, "step": 1500 }, { "epoch": 4.7, "grad_norm": 0.295539915561676, "learning_rate": 9.529906542056076e-05, "loss": 0.1178, "step": 1510 }, { "epoch": 4.74, "grad_norm": 0.0796700268983841, "learning_rate": 9.526791277258567e-05, "loss": 0.0481, "step": 1520 }, { "epoch": 4.77, "grad_norm": 0.1068115308880806, "learning_rate": 9.523676012461059e-05, "loss": 0.0282, "step": 1530 }, { "epoch": 4.8, "grad_norm": 1.0221561193466187, "learning_rate": 9.520560747663552e-05, "loss": 0.0538, "step": 1540 }, { "epoch": 4.83, "grad_norm": 7.369207859039307, "learning_rate": 9.517445482866045e-05, "loss": 0.1239, "step": 1550 }, { "epoch": 4.86, "grad_norm": 9.008218765258789, "learning_rate": 9.514330218068536e-05, "loss": 0.055, "step": 1560 }, { "epoch": 4.89, "grad_norm": 3.585855722427368, "learning_rate": 9.511214953271028e-05, "loss": 0.03, "step": 1570 }, { "epoch": 4.92, "grad_norm": 0.15154320001602173, "learning_rate": 9.50809968847352e-05, "loss": 0.0168, "step": 1580 }, { "epoch": 4.95, "grad_norm": 0.030903339385986328, "learning_rate": 9.504984423676014e-05, "loss": 0.1067, "step": 1590 }, { "epoch": 4.98, "grad_norm": 0.4652014672756195, "learning_rate": 9.501869158878505e-05, "loss": 0.0462, "step": 1600 }, { "epoch": 4.98, "eval_accuracy": 0.8515950069348127, "eval_f1": 0.8455611718307666, "eval_loss": 0.8198381066322327, "eval_precision": 0.8519412050125947, "eval_recall": 0.8515950069348127, "eval_runtime": 39.6856, "eval_samples_per_second": 72.671, "eval_steps_per_second": 9.096, "step": 1600 }, { "epoch": 5.02, "grad_norm": 14.000198364257812, "learning_rate": 9.498753894080997e-05, "loss": 0.0785, "step": 1610 }, { "epoch": 5.05, "grad_norm": 0.21171867847442627, "learning_rate": 9.49563862928349e-05, "loss": 0.019, "step": 1620 }, { "epoch": 5.08, "grad_norm": 0.004491983912885189, "learning_rate": 9.492523364485981e-05, "loss": 0.0111, "step": 1630 }, { "epoch": 5.11, "grad_norm": 0.016514340415596962, "learning_rate": 9.489408099688474e-05, "loss": 0.0897, "step": 1640 }, { "epoch": 5.14, "grad_norm": 8.40817928314209, "learning_rate": 9.486292834890967e-05, "loss": 0.0798, "step": 1650 }, { "epoch": 5.17, "grad_norm": 0.07949113100767136, "learning_rate": 9.483177570093458e-05, "loss": 0.0216, "step": 1660 }, { "epoch": 5.2, "grad_norm": 4.111806869506836, "learning_rate": 9.48006230529595e-05, "loss": 0.03, "step": 1670 }, { "epoch": 5.23, "grad_norm": 0.036615125834941864, "learning_rate": 9.476947040498443e-05, "loss": 0.0115, "step": 1680 }, { "epoch": 5.26, "grad_norm": 0.11661379039287567, "learning_rate": 9.473831775700936e-05, "loss": 0.01, "step": 1690 }, { "epoch": 5.3, "grad_norm": 0.16430974006652832, "learning_rate": 9.470716510903427e-05, "loss": 0.0109, "step": 1700 }, { "epoch": 5.3, "eval_accuracy": 0.8477808599167822, "eval_f1": 0.8378487316868508, "eval_loss": 0.8481851816177368, "eval_precision": 0.8383786998144745, "eval_recall": 0.8477808599167822, "eval_runtime": 39.3442, "eval_samples_per_second": 73.302, "eval_steps_per_second": 9.175, "step": 1700 }, { "epoch": 5.33, "grad_norm": 0.005747305229306221, "learning_rate": 9.467601246105919e-05, "loss": 0.0333, "step": 1710 }, { "epoch": 5.36, "grad_norm": 5.644034385681152, "learning_rate": 9.464485981308412e-05, "loss": 0.0148, "step": 1720 }, { "epoch": 5.39, "grad_norm": 0.002410849556326866, "learning_rate": 9.461370716510903e-05, "loss": 0.0518, "step": 1730 }, { "epoch": 5.42, "grad_norm": 0.002693226793780923, "learning_rate": 9.458255451713396e-05, "loss": 0.0508, "step": 1740 }, { "epoch": 5.45, "grad_norm": 10.301042556762695, "learning_rate": 9.455140186915888e-05, "loss": 0.071, "step": 1750 }, { "epoch": 5.48, "grad_norm": 0.06220326945185661, "learning_rate": 9.452024922118381e-05, "loss": 0.0568, "step": 1760 }, { "epoch": 5.51, "grad_norm": 0.07617553323507309, "learning_rate": 9.448909657320872e-05, "loss": 0.078, "step": 1770 }, { "epoch": 5.55, "grad_norm": 0.08313547819852829, "learning_rate": 9.445794392523365e-05, "loss": 0.0483, "step": 1780 }, { "epoch": 5.58, "grad_norm": 5.813291072845459, "learning_rate": 9.442679127725858e-05, "loss": 0.0369, "step": 1790 }, { "epoch": 5.61, "grad_norm": 6.673477649688721, "learning_rate": 9.43956386292835e-05, "loss": 0.0545, "step": 1800 }, { "epoch": 5.61, "eval_accuracy": 0.8498613037447988, "eval_f1": 0.8506454763787459, "eval_loss": 0.8046442270278931, "eval_precision": 0.8546625043916174, "eval_recall": 0.8498613037447988, "eval_runtime": 39.6817, "eval_samples_per_second": 72.678, "eval_steps_per_second": 9.097, "step": 1800 }, { "epoch": 5.61, "step": 1800, "total_flos": 2.2287694956200755e+18, "train_loss": 0.26933162180913817, "train_runtime": 1430.4649, "train_samples_per_second": 358.485, "train_steps_per_second": 22.44 } ], "logging_steps": 10, "max_steps": 32100, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 100, "total_flos": 2.2287694956200755e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }