{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 100,
"global_step": 1315,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 4.0682622539611835,
"learning_rate": 5e-06,
"loss": 4.2205,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 4.312709224854891,
"learning_rate": 1e-05,
"loss": 4.2858,
"step": 2
},
{
"epoch": 0.01,
"grad_norm": 3.6601980442252438,
"learning_rate": 1.5e-05,
"loss": 4.1264,
"step": 3
},
{
"epoch": 0.02,
"grad_norm": 3.812299442715205,
"learning_rate": 2e-05,
"loss": 4.302,
"step": 4
},
{
"epoch": 0.02,
"grad_norm": 3.116526134864302,
"learning_rate": 2.5e-05,
"loss": 4.1039,
"step": 5
},
{
"epoch": 0.02,
"grad_norm": 2.6960273710433897,
"learning_rate": 3e-05,
"loss": 3.7882,
"step": 6
},
{
"epoch": 0.03,
"grad_norm": 2.511832693580313,
"learning_rate": 3.5e-05,
"loss": 3.4971,
"step": 7
},
{
"epoch": 0.03,
"grad_norm": 1.9433322085763716,
"learning_rate": 4e-05,
"loss": 3.1992,
"step": 8
},
{
"epoch": 0.03,
"grad_norm": 1.6894476126655664,
"learning_rate": 4.5e-05,
"loss": 3.0814,
"step": 9
},
{
"epoch": 0.04,
"grad_norm": 1.073675333188434,
"learning_rate": 5e-05,
"loss": 2.8756,
"step": 10
},
{
"epoch": 0.04,
"grad_norm": 1.4627544436421924,
"learning_rate": 5.500000000000001e-05,
"loss": 2.7048,
"step": 11
},
{
"epoch": 0.05,
"grad_norm": 1.99973700742081,
"learning_rate": 6e-05,
"loss": 2.6456,
"step": 12
},
{
"epoch": 0.05,
"grad_norm": 1.6172270630421473,
"learning_rate": 6.500000000000001e-05,
"loss": 2.6635,
"step": 13
},
{
"epoch": 0.05,
"grad_norm": 1.1295069230502908,
"learning_rate": 7e-05,
"loss": 2.59,
"step": 14
},
{
"epoch": 0.06,
"grad_norm": 1.2649325235735724,
"learning_rate": 7.500000000000001e-05,
"loss": 2.44,
"step": 15
},
{
"epoch": 0.06,
"grad_norm": 1.1789342809576489,
"learning_rate": 8e-05,
"loss": 2.5202,
"step": 16
},
{
"epoch": 0.06,
"grad_norm": 0.7081838083903702,
"learning_rate": 8.5e-05,
"loss": 2.367,
"step": 17
},
{
"epoch": 0.07,
"grad_norm": 0.7108144063965985,
"learning_rate": 9e-05,
"loss": 2.4458,
"step": 18
},
{
"epoch": 0.07,
"grad_norm": 0.6609440232385762,
"learning_rate": 9.5e-05,
"loss": 2.3898,
"step": 19
},
{
"epoch": 0.08,
"grad_norm": 0.5625207480279918,
"learning_rate": 0.0001,
"loss": 2.3829,
"step": 20
},
{
"epoch": 0.08,
"grad_norm": 0.6335176424475211,
"learning_rate": 0.000105,
"loss": 2.3701,
"step": 21
},
{
"epoch": 0.08,
"grad_norm": 0.5478182407846386,
"learning_rate": 0.00011000000000000002,
"loss": 2.3466,
"step": 22
},
{
"epoch": 0.09,
"grad_norm": 0.5542241828994607,
"learning_rate": 0.00011499999999999999,
"loss": 2.3094,
"step": 23
},
{
"epoch": 0.09,
"grad_norm": 0.5391779694945925,
"learning_rate": 0.00012,
"loss": 2.2829,
"step": 24
},
{
"epoch": 0.1,
"grad_norm": 0.5560267338571905,
"learning_rate": 0.000125,
"loss": 2.3031,
"step": 25
},
{
"epoch": 0.1,
"grad_norm": 0.5899065231096706,
"learning_rate": 0.00013000000000000002,
"loss": 2.3218,
"step": 26
},
{
"epoch": 0.1,
"grad_norm": 0.5822360971814909,
"learning_rate": 0.00013500000000000003,
"loss": 2.2678,
"step": 27
},
{
"epoch": 0.11,
"grad_norm": 0.5713806277891299,
"learning_rate": 0.00014,
"loss": 2.2085,
"step": 28
},
{
"epoch": 0.11,
"grad_norm": 0.5778957318063205,
"learning_rate": 0.000145,
"loss": 2.1972,
"step": 29
},
{
"epoch": 0.11,
"grad_norm": 0.6239096985024343,
"learning_rate": 0.00015000000000000001,
"loss": 2.157,
"step": 30
},
{
"epoch": 0.12,
"grad_norm": 0.664987114969562,
"learning_rate": 0.000155,
"loss": 2.144,
"step": 31
},
{
"epoch": 0.12,
"grad_norm": 0.6602627781365719,
"learning_rate": 0.00016,
"loss": 2.1819,
"step": 32
},
{
"epoch": 0.13,
"grad_norm": 0.6953248375158279,
"learning_rate": 0.000165,
"loss": 2.0508,
"step": 33
},
{
"epoch": 0.13,
"grad_norm": 0.750259033408061,
"learning_rate": 0.00017,
"loss": 2.129,
"step": 34
},
{
"epoch": 0.13,
"grad_norm": 0.6643380168708766,
"learning_rate": 0.000175,
"loss": 2.014,
"step": 35
},
{
"epoch": 0.14,
"grad_norm": 0.8486382402556593,
"learning_rate": 0.00018,
"loss": 1.9433,
"step": 36
},
{
"epoch": 0.14,
"grad_norm": 0.7414001872340891,
"learning_rate": 0.00018500000000000002,
"loss": 2.0308,
"step": 37
},
{
"epoch": 0.14,
"grad_norm": 0.7295050222316729,
"learning_rate": 0.00019,
"loss": 1.9284,
"step": 38
},
{
"epoch": 0.15,
"grad_norm": 0.7235284135505695,
"learning_rate": 0.000195,
"loss": 1.8847,
"step": 39
},
{
"epoch": 0.15,
"grad_norm": 0.753068728806512,
"learning_rate": 0.0002,
"loss": 1.8865,
"step": 40
},
{
"epoch": 0.16,
"grad_norm": 0.7560315372185343,
"learning_rate": 0.00019999969643677332,
"loss": 1.9206,
"step": 41
},
{
"epoch": 0.16,
"grad_norm": 0.7536264695992508,
"learning_rate": 0.00019999878574893627,
"loss": 1.8679,
"step": 42
},
{
"epoch": 0.16,
"grad_norm": 0.7688437635155836,
"learning_rate": 0.0001999972679420179,
"loss": 1.8956,
"step": 43
},
{
"epoch": 0.17,
"grad_norm": 0.7269625532226869,
"learning_rate": 0.0001999951430252332,
"loss": 1.7802,
"step": 44
},
{
"epoch": 0.17,
"grad_norm": 0.7518593665603851,
"learning_rate": 0.00019999241101148306,
"loss": 1.8219,
"step": 45
},
{
"epoch": 0.17,
"grad_norm": 0.8030012654181582,
"learning_rate": 0.00019998907191735434,
"loss": 1.7884,
"step": 46
},
{
"epoch": 0.18,
"grad_norm": 0.7431537767433151,
"learning_rate": 0.00019998512576311953,
"loss": 1.7157,
"step": 47
},
{
"epoch": 0.18,
"grad_norm": 0.8789668942389129,
"learning_rate": 0.00019998057257273675,
"loss": 1.705,
"step": 48
},
{
"epoch": 0.19,
"grad_norm": 0.8821830829681667,
"learning_rate": 0.00019997541237384966,
"loss": 1.7197,
"step": 49
},
{
"epoch": 0.19,
"grad_norm": 0.7827120449376896,
"learning_rate": 0.0001999696451977872,
"loss": 1.744,
"step": 50
},
{
"epoch": 0.19,
"grad_norm": 0.7824589333224853,
"learning_rate": 0.00019996327107956333,
"loss": 1.5894,
"step": 51
},
{
"epoch": 0.2,
"grad_norm": 0.805173848765375,
"learning_rate": 0.00019995629005787713,
"loss": 1.764,
"step": 52
},
{
"epoch": 0.2,
"grad_norm": 0.8408625701287021,
"learning_rate": 0.00019994870217511217,
"loss": 1.7377,
"step": 53
},
{
"epoch": 0.21,
"grad_norm": 0.7546347988189869,
"learning_rate": 0.0001999405074773365,
"loss": 1.7158,
"step": 54
},
{
"epoch": 0.21,
"grad_norm": 0.821101064412719,
"learning_rate": 0.0001999317060143023,
"loss": 1.5648,
"step": 55
},
{
"epoch": 0.21,
"grad_norm": 0.7901726402638417,
"learning_rate": 0.00019992229783944557,
"loss": 1.5496,
"step": 56
},
{
"epoch": 0.22,
"grad_norm": 0.8488465994765656,
"learning_rate": 0.00019991228300988585,
"loss": 1.5493,
"step": 57
},
{
"epoch": 0.22,
"grad_norm": 0.8859226699933315,
"learning_rate": 0.0001999016615864258,
"loss": 1.4703,
"step": 58
},
{
"epoch": 0.22,
"grad_norm": 0.9572611655447181,
"learning_rate": 0.0001998904336335509,
"loss": 1.5522,
"step": 59
},
{
"epoch": 0.23,
"grad_norm": 0.8654728304283881,
"learning_rate": 0.00019987859921942903,
"loss": 1.4136,
"step": 60
},
{
"epoch": 0.23,
"grad_norm": 0.7903035834496105,
"learning_rate": 0.00019986615841591002,
"loss": 1.5446,
"step": 61
},
{
"epoch": 0.24,
"grad_norm": 0.8146733074382629,
"learning_rate": 0.0001998531112985253,
"loss": 1.4741,
"step": 62
},
{
"epoch": 0.24,
"grad_norm": 0.8150030052550084,
"learning_rate": 0.00019983945794648734,
"loss": 1.4877,
"step": 63
},
{
"epoch": 0.24,
"grad_norm": 0.8980109486617155,
"learning_rate": 0.00019982519844268933,
"loss": 1.4981,
"step": 64
},
{
"epoch": 0.25,
"grad_norm": 0.9375060640657445,
"learning_rate": 0.00019981033287370443,
"loss": 1.5115,
"step": 65
},
{
"epoch": 0.25,
"grad_norm": 0.8164746637962466,
"learning_rate": 0.00019979486132978545,
"loss": 1.4431,
"step": 66
},
{
"epoch": 0.25,
"grad_norm": 0.8458741113281569,
"learning_rate": 0.0001997787839048642,
"loss": 1.3408,
"step": 67
},
{
"epoch": 0.26,
"grad_norm": 0.8444471080538805,
"learning_rate": 0.00019976210069655104,
"loss": 1.3728,
"step": 68
},
{
"epoch": 0.26,
"grad_norm": 0.9081815525327502,
"learning_rate": 0.0001997448118061341,
"loss": 1.3964,
"step": 69
},
{
"epoch": 0.27,
"grad_norm": 0.9703756561572067,
"learning_rate": 0.00019972691733857883,
"loss": 1.2744,
"step": 70
},
{
"epoch": 0.27,
"grad_norm": 0.8300846731216797,
"learning_rate": 0.00019970841740252725,
"loss": 1.201,
"step": 71
},
{
"epoch": 0.27,
"grad_norm": 0.8970681814115683,
"learning_rate": 0.00019968931211029734,
"loss": 1.3049,
"step": 72
},
{
"epoch": 0.28,
"grad_norm": 0.9203036618648276,
"learning_rate": 0.00019966960157788248,
"loss": 1.3423,
"step": 73
},
{
"epoch": 0.28,
"grad_norm": 0.8554417095674692,
"learning_rate": 0.00019964928592495045,
"loss": 1.2076,
"step": 74
},
{
"epoch": 0.29,
"grad_norm": 0.8693522008551818,
"learning_rate": 0.00019962836527484296,
"loss": 1.2388,
"step": 75
},
{
"epoch": 0.29,
"grad_norm": 0.8955501483204824,
"learning_rate": 0.0001996068397545748,
"loss": 1.2769,
"step": 76
},
{
"epoch": 0.29,
"grad_norm": 0.8972853945681226,
"learning_rate": 0.00019958470949483318,
"loss": 1.2043,
"step": 77
},
{
"epoch": 0.3,
"grad_norm": 0.9200733943459781,
"learning_rate": 0.00019956197462997667,
"loss": 1.1127,
"step": 78
},
{
"epoch": 0.3,
"grad_norm": 1.046096681591468,
"learning_rate": 0.00019953863529803466,
"loss": 1.1452,
"step": 79
},
{
"epoch": 0.3,
"grad_norm": 1.0188934085587635,
"learning_rate": 0.00019951469164070646,
"loss": 1.1164,
"step": 80
},
{
"epoch": 0.31,
"grad_norm": 0.8178106254313157,
"learning_rate": 0.00019949014380336028,
"loss": 1.0532,
"step": 81
},
{
"epoch": 0.31,
"grad_norm": 0.7925775197670102,
"learning_rate": 0.00019946499193503262,
"loss": 1.0213,
"step": 82
},
{
"epoch": 0.32,
"grad_norm": 0.8643770007688552,
"learning_rate": 0.000199439236188427,
"loss": 1.204,
"step": 83
},
{
"epoch": 0.32,
"grad_norm": 0.8294076531102257,
"learning_rate": 0.0001994128767199135,
"loss": 1.1476,
"step": 84
},
{
"epoch": 0.32,
"grad_norm": 0.8800850098334678,
"learning_rate": 0.0001993859136895274,
"loss": 1.0514,
"step": 85
},
{
"epoch": 0.33,
"grad_norm": 0.9129898809803545,
"learning_rate": 0.0001993583472609683,
"loss": 1.0532,
"step": 86
},
{
"epoch": 0.33,
"grad_norm": 0.9139962309722894,
"learning_rate": 0.00019933017760159937,
"loss": 1.116,
"step": 87
},
{
"epoch": 0.33,
"grad_norm": 0.8049163381340136,
"learning_rate": 0.00019930140488244602,
"loss": 1.0375,
"step": 88
},
{
"epoch": 0.34,
"grad_norm": 0.7506103786806886,
"learning_rate": 0.0001992720292781951,
"loss": 0.9954,
"step": 89
},
{
"epoch": 0.34,
"grad_norm": 0.8494376985530913,
"learning_rate": 0.0001992420509671936,
"loss": 1.011,
"step": 90
},
{
"epoch": 0.35,
"grad_norm": 1.0838465138358377,
"learning_rate": 0.0001992114701314478,
"loss": 1.1547,
"step": 91
},
{
"epoch": 0.35,
"grad_norm": 0.7780322978386432,
"learning_rate": 0.00019918028695662207,
"loss": 0.8977,
"step": 92
},
{
"epoch": 0.35,
"grad_norm": 0.7445309005650187,
"learning_rate": 0.00019914850163203768,
"loss": 0.9499,
"step": 93
},
{
"epoch": 0.36,
"grad_norm": 0.75618752117177,
"learning_rate": 0.00019911611435067172,
"loss": 0.9058,
"step": 94
},
{
"epoch": 0.36,
"grad_norm": 0.922514343070264,
"learning_rate": 0.00019908312530915603,
"loss": 0.9812,
"step": 95
},
{
"epoch": 0.37,
"grad_norm": 0.8335423463861146,
"learning_rate": 0.00019904953470777575,
"loss": 0.9237,
"step": 96
},
{
"epoch": 0.37,
"grad_norm": 0.8569297364051901,
"learning_rate": 0.0001990153427504683,
"loss": 0.9101,
"step": 97
},
{
"epoch": 0.37,
"grad_norm": 0.8431577631707102,
"learning_rate": 0.00019898054964482214,
"loss": 0.8826,
"step": 98
},
{
"epoch": 0.38,
"grad_norm": 0.8393428609756466,
"learning_rate": 0.00019894515560207537,
"loss": 0.9449,
"step": 99
},
{
"epoch": 0.38,
"grad_norm": 0.8009262477457554,
"learning_rate": 0.0001989091608371146,
"loss": 0.8172,
"step": 100
},
{
"epoch": 0.38,
"eval_blimp_filtered_avg": 0.7153731343283583,
"eval_blimp_filtered_std": 0.004995278040149823,
"step": 100
},
{
"epoch": 0.38,
"eval_blimp_supplement_avg": 0.8275862068965517,
"eval_blimp_supplement_std": 0.016574057417324883,
"step": 100
},
{
"epoch": 0.38,
"eval_vqa_filtered_avg": 0.5,
"eval_vqa_filtered_std": 0.050251890762960605,
"step": 100
},
{
"epoch": 0.38,
"eval_winoground_filtered_avg": 0.64,
"eval_winoground_filtered_std": 0.048241815132442176,
"step": 100
},
{
"epoch": 0.38,
"grad_norm": 0.7731939394783305,
"learning_rate": 0.0001988725655684736,
"loss": 0.841,
"step": 101
},
{
"epoch": 0.39,
"grad_norm": 0.9253945491288401,
"learning_rate": 0.00019883537001833188,
"loss": 0.9227,
"step": 102
},
{
"epoch": 0.39,
"grad_norm": 0.9312582898914622,
"learning_rate": 0.0001987975744125135,
"loss": 0.9283,
"step": 103
},
{
"epoch": 0.4,
"grad_norm": 0.781149897014019,
"learning_rate": 0.00019875917898048558,
"loss": 0.7902,
"step": 104
},
{
"epoch": 0.4,
"grad_norm": 0.8132523662925432,
"learning_rate": 0.0001987201839553569,
"loss": 0.8595,
"step": 105
},
{
"epoch": 0.4,
"grad_norm": 0.8368435917049845,
"learning_rate": 0.00019868058957387663,
"loss": 0.8643,
"step": 106
},
{
"epoch": 0.41,
"grad_norm": 0.7460582173754553,
"learning_rate": 0.00019864039607643273,
"loss": 0.7484,
"step": 107
},
{
"epoch": 0.41,
"grad_norm": 0.7789594633508866,
"learning_rate": 0.0001985996037070505,
"loss": 0.7889,
"step": 108
},
{
"epoch": 0.41,
"grad_norm": 0.8428485721906823,
"learning_rate": 0.00019855821271339125,
"loss": 0.838,
"step": 109
},
{
"epoch": 0.42,
"grad_norm": 0.8694838248366092,
"learning_rate": 0.00019851622334675066,
"loss": 0.77,
"step": 110
},
{
"epoch": 0.42,
"grad_norm": 0.8994640080800599,
"learning_rate": 0.00019847363586205727,
"loss": 0.7702,
"step": 111
},
{
"epoch": 0.43,
"grad_norm": 0.9931271763497148,
"learning_rate": 0.00019843045051787096,
"loss": 0.8737,
"step": 112
},
{
"epoch": 0.43,
"grad_norm": 0.8186757528083024,
"learning_rate": 0.00019838666757638135,
"loss": 0.7276,
"step": 113
},
{
"epoch": 0.43,
"grad_norm": 0.7976246474968561,
"learning_rate": 0.0001983422873034063,
"loss": 0.7967,
"step": 114
},
{
"epoch": 0.44,
"grad_norm": 0.7725738117793295,
"learning_rate": 0.0001982973099683902,
"loss": 0.8214,
"step": 115
},
{
"epoch": 0.44,
"grad_norm": 0.7497131152895721,
"learning_rate": 0.00019825173584440232,
"loss": 0.7069,
"step": 116
},
{
"epoch": 0.44,
"grad_norm": 0.8096175366747367,
"learning_rate": 0.0001982055652081352,
"loss": 0.699,
"step": 117
},
{
"epoch": 0.45,
"grad_norm": 0.8131636775412501,
"learning_rate": 0.00019815879833990304,
"loss": 0.7479,
"step": 118
},
{
"epoch": 0.45,
"grad_norm": 0.8103259093372099,
"learning_rate": 0.00019811143552363983,
"loss": 0.7013,
"step": 119
},
{
"epoch": 0.46,
"grad_norm": 0.8280999953040724,
"learning_rate": 0.00019806347704689778,
"loss": 0.6887,
"step": 120
},
{
"epoch": 0.46,
"grad_norm": 0.8116880398114632,
"learning_rate": 0.00019801492320084546,
"loss": 0.7934,
"step": 121
},
{
"epoch": 0.46,
"grad_norm": 0.8303791216043742,
"learning_rate": 0.00019796577428026616,
"loss": 0.8011,
"step": 122
},
{
"epoch": 0.47,
"grad_norm": 0.7619123156871999,
"learning_rate": 0.00019791603058355595,
"loss": 0.7217,
"step": 123
},
{
"epoch": 0.47,
"grad_norm": 0.7733520932459683,
"learning_rate": 0.00019786569241272197,
"loss": 0.6981,
"step": 124
},
{
"epoch": 0.48,
"grad_norm": 0.7846699948014345,
"learning_rate": 0.00019781476007338058,
"loss": 0.7401,
"step": 125
},
{
"epoch": 0.48,
"grad_norm": 0.7963675285315092,
"learning_rate": 0.00019776323387475547,
"loss": 0.705,
"step": 126
},
{
"epoch": 0.48,
"grad_norm": 0.7856922504610907,
"learning_rate": 0.00019771111412967583,
"loss": 0.6999,
"step": 127
},
{
"epoch": 0.49,
"grad_norm": 0.831698101406348,
"learning_rate": 0.0001976584011545744,
"loss": 0.7021,
"step": 128
},
{
"epoch": 0.49,
"grad_norm": 0.8387219512744284,
"learning_rate": 0.00019760509526948566,
"loss": 0.7296,
"step": 129
},
{
"epoch": 0.49,
"grad_norm": 0.8945842296083665,
"learning_rate": 0.00019755119679804367,
"loss": 0.7529,
"step": 130
},
{
"epoch": 0.5,
"grad_norm": 0.7498731306068172,
"learning_rate": 0.00019749670606748033,
"loss": 0.6911,
"step": 131
},
{
"epoch": 0.5,
"grad_norm": 0.7247722560830141,
"learning_rate": 0.0001974416234086233,
"loss": 0.6402,
"step": 132
},
{
"epoch": 0.51,
"grad_norm": 0.7627410009794175,
"learning_rate": 0.00019738594915589397,
"loss": 0.6702,
"step": 133
},
{
"epoch": 0.51,
"grad_norm": 0.7772008131178036,
"learning_rate": 0.00019732968364730545,
"loss": 0.6963,
"step": 134
},
{
"epoch": 0.51,
"grad_norm": 0.8275796285376315,
"learning_rate": 0.00019727282722446047,
"loss": 0.5929,
"step": 135
},
{
"epoch": 0.52,
"grad_norm": 0.7831120443420706,
"learning_rate": 0.0001972153802325495,
"loss": 0.6179,
"step": 136
},
{
"epoch": 0.52,
"grad_norm": 0.8292277458348148,
"learning_rate": 0.0001971573430203484,
"loss": 0.6375,
"step": 137
},
{
"epoch": 0.52,
"grad_norm": 0.8340476959630893,
"learning_rate": 0.00019709871594021642,
"loss": 0.6766,
"step": 138
},
{
"epoch": 0.53,
"grad_norm": 0.7994798406256604,
"learning_rate": 0.00019703949934809408,
"loss": 0.6239,
"step": 139
},
{
"epoch": 0.53,
"grad_norm": 0.7425672818805296,
"learning_rate": 0.00019697969360350098,
"loss": 0.6334,
"step": 140
},
{
"epoch": 0.54,
"grad_norm": 0.7443030221681443,
"learning_rate": 0.00019691929906953356,
"loss": 0.6278,
"step": 141
},
{
"epoch": 0.54,
"grad_norm": 0.7858836231557178,
"learning_rate": 0.0001968583161128631,
"loss": 0.6494,
"step": 142
},
{
"epoch": 0.54,
"grad_norm": 0.7748084569986792,
"learning_rate": 0.00019679674510373325,
"loss": 0.6408,
"step": 143
},
{
"epoch": 0.55,
"grad_norm": 0.7352594538958535,
"learning_rate": 0.00019673458641595784,
"loss": 0.54,
"step": 144
},
{
"epoch": 0.55,
"grad_norm": 0.9565541632170729,
"learning_rate": 0.00019667184042691875,
"loss": 0.6669,
"step": 145
},
{
"epoch": 0.56,
"grad_norm": 0.8159705350255112,
"learning_rate": 0.00019660850751756348,
"loss": 0.549,
"step": 146
},
{
"epoch": 0.56,
"grad_norm": 0.853659410491719,
"learning_rate": 0.00019654458807240283,
"loss": 0.703,
"step": 147
},
{
"epoch": 0.56,
"grad_norm": 0.7233148520384484,
"learning_rate": 0.0001964800824795087,
"loss": 0.5607,
"step": 148
},
{
"epoch": 0.57,
"grad_norm": 0.8017084720492917,
"learning_rate": 0.00019641499113051157,
"loss": 0.6522,
"step": 149
},
{
"epoch": 0.57,
"grad_norm": 0.6987528025960802,
"learning_rate": 0.00019634931442059832,
"loss": 0.5149,
"step": 150
},
{
"epoch": 0.57,
"grad_norm": 0.8039264184533087,
"learning_rate": 0.00019628305274850956,
"loss": 0.7189,
"step": 151
},
{
"epoch": 0.58,
"grad_norm": 0.795017894232834,
"learning_rate": 0.00019621620651653744,
"loss": 0.6096,
"step": 152
},
{
"epoch": 0.58,
"grad_norm": 0.7625532547573897,
"learning_rate": 0.00019614877613052312,
"loss": 0.583,
"step": 153
},
{
"epoch": 0.59,
"grad_norm": 0.8479800597198418,
"learning_rate": 0.00019608076199985433,
"loss": 0.5841,
"step": 154
},
{
"epoch": 0.59,
"grad_norm": 0.7984666488345297,
"learning_rate": 0.00019601216453746283,
"loss": 0.617,
"step": 155
},
{
"epoch": 0.59,
"grad_norm": 0.7291592602065788,
"learning_rate": 0.00019594298415982194,
"loss": 0.5409,
"step": 156
},
{
"epoch": 0.6,
"grad_norm": 0.7444160049758751,
"learning_rate": 0.0001958732212869441,
"loss": 0.5235,
"step": 157
},
{
"epoch": 0.6,
"grad_norm": 0.7698434281876576,
"learning_rate": 0.00019580287634237808,
"loss": 0.518,
"step": 158
},
{
"epoch": 0.6,
"grad_norm": 0.658666402238984,
"learning_rate": 0.00019573194975320673,
"loss": 0.4665,
"step": 159
},
{
"epoch": 0.61,
"grad_norm": 0.9088095098968566,
"learning_rate": 0.0001956604419500441,
"loss": 0.719,
"step": 160
},
{
"epoch": 0.61,
"grad_norm": 0.6790919494296814,
"learning_rate": 0.00019558835336703294,
"loss": 0.5317,
"step": 161
},
{
"epoch": 0.62,
"grad_norm": 0.7461814218746576,
"learning_rate": 0.00019551568444184215,
"loss": 0.5383,
"step": 162
},
{
"epoch": 0.62,
"grad_norm": 0.750432430440515,
"learning_rate": 0.00019544243561566403,
"loss": 0.5389,
"step": 163
},
{
"epoch": 0.62,
"grad_norm": 0.7472033806009103,
"learning_rate": 0.00019536860733321152,
"loss": 0.5066,
"step": 164
},
{
"epoch": 0.63,
"grad_norm": 0.7137901167042525,
"learning_rate": 0.00019529420004271567,
"loss": 0.4392,
"step": 165
},
{
"epoch": 0.63,
"grad_norm": 0.8491528440848178,
"learning_rate": 0.00019521921419592283,
"loss": 0.4803,
"step": 166
},
{
"epoch": 0.63,
"grad_norm": 0.888429079441991,
"learning_rate": 0.0001951436502480919,
"loss": 0.4945,
"step": 167
},
{
"epoch": 0.64,
"grad_norm": 0.7568924214482684,
"learning_rate": 0.00019506750865799162,
"loss": 0.5297,
"step": 168
},
{
"epoch": 0.64,
"grad_norm": 0.772439312971248,
"learning_rate": 0.0001949907898878977,
"loss": 0.5506,
"step": 169
},
{
"epoch": 0.65,
"grad_norm": 0.7127854034612457,
"learning_rate": 0.00019491349440359015,
"loss": 0.5152,
"step": 170
},
{
"epoch": 0.65,
"grad_norm": 0.6775965473007823,
"learning_rate": 0.00019483562267435018,
"loss": 0.4436,
"step": 171
},
{
"epoch": 0.65,
"grad_norm": 0.7085484218846827,
"learning_rate": 0.00019475717517295778,
"loss": 0.4726,
"step": 172
},
{
"epoch": 0.66,
"grad_norm": 0.6960992654258619,
"learning_rate": 0.00019467815237568842,
"loss": 0.4145,
"step": 173
},
{
"epoch": 0.66,
"grad_norm": 0.7739939422897306,
"learning_rate": 0.00019459855476231043,
"loss": 0.5368,
"step": 174
},
{
"epoch": 0.67,
"grad_norm": 0.7248885499578243,
"learning_rate": 0.00019451838281608197,
"loss": 0.4216,
"step": 175
},
{
"epoch": 0.67,
"grad_norm": 0.624271177758561,
"learning_rate": 0.00019443763702374812,
"loss": 0.3793,
"step": 176
},
{
"epoch": 0.67,
"grad_norm": 0.6547069254648431,
"learning_rate": 0.00019435631787553795,
"loss": 0.3958,
"step": 177
},
{
"epoch": 0.68,
"grad_norm": 0.7974662713630617,
"learning_rate": 0.00019427442586516155,
"loss": 0.4547,
"step": 178
},
{
"epoch": 0.68,
"grad_norm": 0.6979645264553271,
"learning_rate": 0.00019419196148980693,
"loss": 0.4269,
"step": 179
},
{
"epoch": 0.68,
"grad_norm": 0.7406062144284534,
"learning_rate": 0.0001941089252501372,
"loss": 0.4589,
"step": 180
},
{
"epoch": 0.69,
"grad_norm": 0.7721441112785269,
"learning_rate": 0.00019402531765028722,
"loss": 0.5091,
"step": 181
},
{
"epoch": 0.69,
"grad_norm": 0.6823443706634189,
"learning_rate": 0.00019394113919786094,
"loss": 0.4578,
"step": 182
},
{
"epoch": 0.7,
"grad_norm": 0.6296709496710926,
"learning_rate": 0.00019385639040392803,
"loss": 0.4192,
"step": 183
},
{
"epoch": 0.7,
"grad_norm": 0.7391260784858245,
"learning_rate": 0.00019377107178302074,
"loss": 0.5101,
"step": 184
},
{
"epoch": 0.7,
"grad_norm": 0.7622753386981794,
"learning_rate": 0.00019368518385313107,
"loss": 0.5035,
"step": 185
},
{
"epoch": 0.71,
"grad_norm": 0.7760535426376107,
"learning_rate": 0.00019359872713570732,
"loss": 0.4588,
"step": 186
},
{
"epoch": 0.71,
"grad_norm": 0.7543376381267045,
"learning_rate": 0.00019351170215565114,
"loss": 0.4783,
"step": 187
},
{
"epoch": 0.71,
"grad_norm": 0.761968062534956,
"learning_rate": 0.00019342410944131415,
"loss": 0.4452,
"step": 188
},
{
"epoch": 0.72,
"grad_norm": 0.6064844487399527,
"learning_rate": 0.00019333594952449488,
"loss": 0.3948,
"step": 189
},
{
"epoch": 0.72,
"grad_norm": 0.663418856322293,
"learning_rate": 0.00019324722294043558,
"loss": 0.3929,
"step": 190
},
{
"epoch": 0.73,
"grad_norm": 0.7628743041097833,
"learning_rate": 0.00019315793022781877,
"loss": 0.4686,
"step": 191
},
{
"epoch": 0.73,
"grad_norm": 0.7308155097864294,
"learning_rate": 0.00019306807192876412,
"loss": 0.4433,
"step": 192
},
{
"epoch": 0.73,
"grad_norm": 0.6266943285552069,
"learning_rate": 0.00019297764858882514,
"loss": 0.3778,
"step": 193
},
{
"epoch": 0.74,
"grad_norm": 0.7304858297996167,
"learning_rate": 0.00019288666075698588,
"loss": 0.4559,
"step": 194
},
{
"epoch": 0.74,
"grad_norm": 0.793956243255512,
"learning_rate": 0.0001927951089856575,
"loss": 0.5042,
"step": 195
},
{
"epoch": 0.75,
"grad_norm": 0.7521517748378912,
"learning_rate": 0.00019270299383067498,
"loss": 0.4876,
"step": 196
},
{
"epoch": 0.75,
"grad_norm": 0.6541305777620361,
"learning_rate": 0.00019261031585129386,
"loss": 0.4332,
"step": 197
},
{
"epoch": 0.75,
"grad_norm": 0.7459198000292951,
"learning_rate": 0.0001925170756101867,
"loss": 0.4742,
"step": 198
},
{
"epoch": 0.76,
"grad_norm": 0.7351417487869903,
"learning_rate": 0.0001924232736734396,
"loss": 0.4613,
"step": 199
},
{
"epoch": 0.76,
"grad_norm": 0.6880337228260249,
"learning_rate": 0.00019232891061054895,
"loss": 0.4507,
"step": 200
},
{
"epoch": 0.76,
"eval_blimp_filtered_avg": 0.7174626865671642,
"eval_blimp_filtered_std": 0.004966500893623926,
"step": 200
},
{
"epoch": 0.76,
"eval_blimp_supplement_avg": 0.8448275862068966,
"eval_blimp_supplement_std": 0.015878951045947127,
"step": 200
},
{
"epoch": 0.76,
"eval_vqa_filtered_avg": 0.49,
"eval_vqa_filtered_std": 0.05024183937956912,
"step": 200
},
{
"epoch": 0.76,
"eval_winoground_filtered_avg": 0.68,
"eval_winoground_filtered_std": 0.046882617226215034,
"step": 200
},
{
"epoch": 0.76,
"grad_norm": 0.6676123005810946,
"learning_rate": 0.00019223398699441785,
"loss": 0.4456,
"step": 201
},
{
"epoch": 0.77,
"grad_norm": 0.6441892256930322,
"learning_rate": 0.00019213850340135276,
"loss": 0.3842,
"step": 202
},
{
"epoch": 0.77,
"grad_norm": 0.7664915489965373,
"learning_rate": 0.00019204246041105974,
"loss": 0.4655,
"step": 203
},
{
"epoch": 0.78,
"grad_norm": 0.7694952589016031,
"learning_rate": 0.0001919458586066412,
"loss": 0.4042,
"step": 204
},
{
"epoch": 0.78,
"grad_norm": 0.8094634626332351,
"learning_rate": 0.00019184869857459232,
"loss": 0.4796,
"step": 205
},
{
"epoch": 0.78,
"grad_norm": 0.7250896027775108,
"learning_rate": 0.00019175098090479727,
"loss": 0.4493,
"step": 206
},
{
"epoch": 0.79,
"grad_norm": 0.695912278074156,
"learning_rate": 0.00019165270619052595,
"loss": 0.3892,
"step": 207
},
{
"epoch": 0.79,
"grad_norm": 0.6437397715709255,
"learning_rate": 0.00019155387502843013,
"loss": 0.3551,
"step": 208
},
{
"epoch": 0.79,
"grad_norm": 0.6624223376721595,
"learning_rate": 0.00019145448801853989,
"loss": 0.3824,
"step": 209
},
{
"epoch": 0.8,
"grad_norm": 0.7132818106598392,
"learning_rate": 0.0001913545457642601,
"loss": 0.443,
"step": 210
},
{
"epoch": 0.8,
"grad_norm": 0.5835412605472261,
"learning_rate": 0.00019125404887236663,
"loss": 0.2834,
"step": 211
},
{
"epoch": 0.81,
"grad_norm": 0.7082542269732592,
"learning_rate": 0.00019115299795300267,
"loss": 0.3691,
"step": 212
},
{
"epoch": 0.81,
"grad_norm": 0.8422813525765561,
"learning_rate": 0.00019105139361967507,
"loss": 0.4764,
"step": 213
},
{
"epoch": 0.81,
"grad_norm": 0.827672240696612,
"learning_rate": 0.00019094923648925067,
"loss": 0.4577,
"step": 214
},
{
"epoch": 0.82,
"grad_norm": 0.6311700380194617,
"learning_rate": 0.00019084652718195238,
"loss": 0.3641,
"step": 215
},
{
"epoch": 0.82,
"grad_norm": 0.6733244279961911,
"learning_rate": 0.00019074326632135562,
"loss": 0.3914,
"step": 216
},
{
"epoch": 0.83,
"grad_norm": 0.688661851686969,
"learning_rate": 0.00019063945453438432,
"loss": 0.4429,
"step": 217
},
{
"epoch": 0.83,
"grad_norm": 0.593793920110184,
"learning_rate": 0.0001905350924513074,
"loss": 0.3195,
"step": 218
},
{
"epoch": 0.83,
"grad_norm": 0.6401900884414997,
"learning_rate": 0.0001904301807057346,
"loss": 0.3838,
"step": 219
},
{
"epoch": 0.84,
"grad_norm": 0.6222158517510352,
"learning_rate": 0.0001903247199346129,
"loss": 0.3656,
"step": 220
},
{
"epoch": 0.84,
"grad_norm": 0.6346751167318555,
"learning_rate": 0.00019021871077822255,
"loss": 0.3423,
"step": 221
},
{
"epoch": 0.84,
"grad_norm": 0.6573146165491399,
"learning_rate": 0.00019011215388017316,
"loss": 0.3611,
"step": 222
},
{
"epoch": 0.85,
"grad_norm": 0.7506685575783819,
"learning_rate": 0.00019000504988739986,
"loss": 0.4136,
"step": 223
},
{
"epoch": 0.85,
"grad_norm": 0.7933264473843609,
"learning_rate": 0.00018989739945015933,
"loss": 0.4243,
"step": 224
},
{
"epoch": 0.86,
"grad_norm": 0.7056764465112211,
"learning_rate": 0.00018978920322202582,
"loss": 0.393,
"step": 225
},
{
"epoch": 0.86,
"grad_norm": 0.6440690477957951,
"learning_rate": 0.00018968046185988732,
"loss": 0.3339,
"step": 226
},
{
"epoch": 0.86,
"grad_norm": 0.7240338095005376,
"learning_rate": 0.0001895711760239413,
"loss": 0.4307,
"step": 227
},
{
"epoch": 0.87,
"grad_norm": 0.6482583513704759,
"learning_rate": 0.00018946134637769105,
"loss": 0.348,
"step": 228
},
{
"epoch": 0.87,
"grad_norm": 0.6314129612332641,
"learning_rate": 0.00018935097358794144,
"loss": 0.3594,
"step": 229
},
{
"epoch": 0.87,
"grad_norm": 0.6952900742432684,
"learning_rate": 0.00018924005832479478,
"loss": 0.4159,
"step": 230
},
{
"epoch": 0.88,
"grad_norm": 0.6973683572083686,
"learning_rate": 0.00018912860126164707,
"loss": 0.372,
"step": 231
},
{
"epoch": 0.88,
"grad_norm": 0.6228752985295244,
"learning_rate": 0.00018901660307518354,
"loss": 0.3182,
"step": 232
},
{
"epoch": 0.89,
"grad_norm": 0.6853492125246201,
"learning_rate": 0.00018890406444537486,
"loss": 0.3377,
"step": 233
},
{
"epoch": 0.89,
"grad_norm": 0.7047592541956417,
"learning_rate": 0.0001887909860554728,
"loss": 0.3938,
"step": 234
},
{
"epoch": 0.89,
"grad_norm": 0.7514455442163654,
"learning_rate": 0.0001886773685920062,
"loss": 0.4148,
"step": 235
},
{
"epoch": 0.9,
"grad_norm": 0.6032514516825037,
"learning_rate": 0.00018856321274477673,
"loss": 0.2853,
"step": 236
},
{
"epoch": 0.9,
"grad_norm": 0.6566649001522608,
"learning_rate": 0.0001884485192068547,
"loss": 0.333,
"step": 237
},
{
"epoch": 0.9,
"grad_norm": 0.5910387459158075,
"learning_rate": 0.00018833328867457497,
"loss": 0.2821,
"step": 238
},
{
"epoch": 0.91,
"grad_norm": 0.6135310255057351,
"learning_rate": 0.00018821752184753252,
"loss": 0.3528,
"step": 239
},
{
"epoch": 0.91,
"grad_norm": 0.6456839429891811,
"learning_rate": 0.00018810121942857845,
"loss": 0.3332,
"step": 240
},
{
"epoch": 0.92,
"grad_norm": 0.7053660522639559,
"learning_rate": 0.0001879843821238155,
"loss": 0.38,
"step": 241
},
{
"epoch": 0.92,
"grad_norm": 0.6185890160689334,
"learning_rate": 0.00018786701064259383,
"loss": 0.2801,
"step": 242
},
{
"epoch": 0.92,
"grad_norm": 0.7139325318031294,
"learning_rate": 0.00018774910569750673,
"loss": 0.3278,
"step": 243
},
{
"epoch": 0.93,
"grad_norm": 0.7529151224886904,
"learning_rate": 0.00018763066800438636,
"loss": 0.3163,
"step": 244
},
{
"epoch": 0.93,
"grad_norm": 0.7718980285573507,
"learning_rate": 0.00018751169828229927,
"loss": 0.3497,
"step": 245
},
{
"epoch": 0.94,
"grad_norm": 0.6881945450213135,
"learning_rate": 0.00018739219725354212,
"loss": 0.3393,
"step": 246
},
{
"epoch": 0.94,
"grad_norm": 0.6753691180440268,
"learning_rate": 0.00018727216564363723,
"loss": 0.3151,
"step": 247
},
{
"epoch": 0.94,
"grad_norm": 0.6015854239315479,
"learning_rate": 0.00018715160418132832,
"loss": 0.3239,
"step": 248
},
{
"epoch": 0.95,
"grad_norm": 0.5509912220014497,
"learning_rate": 0.00018703051359857586,
"loss": 0.2652,
"step": 249
},
{
"epoch": 0.95,
"grad_norm": 0.6581124878526092,
"learning_rate": 0.00018690889463055283,
"loss": 0.3287,
"step": 250
},
{
"epoch": 0.95,
"grad_norm": 0.6346317349050878,
"learning_rate": 0.0001867867480156402,
"loss": 0.2897,
"step": 251
},
{
"epoch": 0.96,
"grad_norm": 0.658112039759539,
"learning_rate": 0.00018666407449542232,
"loss": 0.2801,
"step": 252
},
{
"epoch": 0.96,
"grad_norm": 0.7964252151066129,
"learning_rate": 0.0001865408748146826,
"loss": 0.3569,
"step": 253
},
{
"epoch": 0.97,
"grad_norm": 0.7087027514563875,
"learning_rate": 0.0001864171497213989,
"loss": 0.319,
"step": 254
},
{
"epoch": 0.97,
"grad_norm": 0.5693751866666873,
"learning_rate": 0.00018629289996673897,
"loss": 0.296,
"step": 255
},
{
"epoch": 0.97,
"grad_norm": 0.4858712761873739,
"learning_rate": 0.00018616812630505597,
"loss": 0.2337,
"step": 256
},
{
"epoch": 0.98,
"grad_norm": 0.6605776129831655,
"learning_rate": 0.0001860428294938838,
"loss": 0.4049,
"step": 257
},
{
"epoch": 0.98,
"grad_norm": 0.5884323793209852,
"learning_rate": 0.00018591701029393255,
"loss": 0.2898,
"step": 258
},
{
"epoch": 0.98,
"grad_norm": 0.6234908079533132,
"learning_rate": 0.00018579066946908384,
"loss": 0.3411,
"step": 259
},
{
"epoch": 0.99,
"grad_norm": 0.5283455200292801,
"learning_rate": 0.00018566380778638628,
"loss": 0.2715,
"step": 260
},
{
"epoch": 0.99,
"grad_norm": 0.5257190836713216,
"learning_rate": 0.00018553642601605068,
"loss": 0.242,
"step": 261
},
{
"epoch": 1.0,
"grad_norm": 0.6940858344290759,
"learning_rate": 0.00018540852493144545,
"loss": 0.3354,
"step": 262
},
{
"epoch": 1.0,
"grad_norm": 0.6657062269261629,
"learning_rate": 0.00018528010530909192,
"loss": 0.3073,
"step": 263
},
{
"epoch": 1.0,
"grad_norm": 0.5838419940555095,
"learning_rate": 0.00018515116792865957,
"loss": 0.2293,
"step": 264
},
{
"epoch": 1.01,
"grad_norm": 0.6175240728549981,
"learning_rate": 0.00018502171357296144,
"loss": 0.2266,
"step": 265
},
{
"epoch": 1.01,
"grad_norm": 0.514362276581525,
"learning_rate": 0.00018489174302794905,
"loss": 0.2012,
"step": 266
},
{
"epoch": 1.02,
"grad_norm": 0.5495577560324101,
"learning_rate": 0.000184761257082708,
"loss": 0.1992,
"step": 267
},
{
"epoch": 1.02,
"grad_norm": 0.5803859857188628,
"learning_rate": 0.000184630256529453,
"loss": 0.2063,
"step": 268
},
{
"epoch": 1.02,
"grad_norm": 0.6033165262234346,
"learning_rate": 0.00018449874216352306,
"loss": 0.1708,
"step": 269
},
{
"epoch": 1.03,
"grad_norm": 0.6125263532068304,
"learning_rate": 0.00018436671478337666,
"loss": 0.1831,
"step": 270
},
{
"epoch": 1.03,
"grad_norm": 0.5508763917497557,
"learning_rate": 0.00018423417519058694,
"loss": 0.1907,
"step": 271
},
{
"epoch": 1.03,
"grad_norm": 0.6343768467745463,
"learning_rate": 0.0001841011241898369,
"loss": 0.2211,
"step": 272
},
{
"epoch": 1.04,
"grad_norm": 0.5592715854666213,
"learning_rate": 0.0001839675625889143,
"loss": 0.1835,
"step": 273
},
{
"epoch": 1.04,
"grad_norm": 0.6853195392487573,
"learning_rate": 0.00018383349119870695,
"loss": 0.2482,
"step": 274
},
{
"epoch": 1.05,
"grad_norm": 0.7078404397255201,
"learning_rate": 0.00018369891083319778,
"loss": 0.2346,
"step": 275
},
{
"epoch": 1.05,
"grad_norm": 0.5364140790780816,
"learning_rate": 0.00018356382230945976,
"loss": 0.2005,
"step": 276
},
{
"epoch": 1.05,
"grad_norm": 0.5699757158649345,
"learning_rate": 0.00018342822644765104,
"loss": 0.1952,
"step": 277
},
{
"epoch": 1.06,
"grad_norm": 0.533449428098461,
"learning_rate": 0.00018329212407100994,
"loss": 0.1827,
"step": 278
},
{
"epoch": 1.06,
"grad_norm": 0.5434329673812834,
"learning_rate": 0.00018315551600585009,
"loss": 0.1977,
"step": 279
},
{
"epoch": 1.06,
"grad_norm": 0.6086249429030556,
"learning_rate": 0.00018301840308155507,
"loss": 0.2075,
"step": 280
},
{
"epoch": 1.07,
"grad_norm": 0.5193552341089737,
"learning_rate": 0.0001828807861305738,
"loss": 0.1774,
"step": 281
},
{
"epoch": 1.07,
"grad_norm": 0.5899649673976222,
"learning_rate": 0.00018274266598841517,
"loss": 0.1935,
"step": 282
},
{
"epoch": 1.08,
"grad_norm": 0.6261684110299474,
"learning_rate": 0.0001826040434936431,
"loss": 0.1879,
"step": 283
},
{
"epoch": 1.08,
"grad_norm": 0.5177380809892513,
"learning_rate": 0.0001824649194878714,
"loss": 0.1605,
"step": 284
},
{
"epoch": 1.08,
"grad_norm": 0.6856627216594803,
"learning_rate": 0.00018232529481575872,
"loss": 0.1959,
"step": 285
},
{
"epoch": 1.09,
"grad_norm": 0.7111645847311474,
"learning_rate": 0.00018218517032500344,
"loss": 0.2339,
"step": 286
},
{
"epoch": 1.09,
"grad_norm": 0.6073029325620858,
"learning_rate": 0.00018204454686633834,
"loss": 0.1954,
"step": 287
},
{
"epoch": 1.1,
"grad_norm": 0.5935505620907925,
"learning_rate": 0.00018190342529352565,
"loss": 0.1935,
"step": 288
},
{
"epoch": 1.1,
"grad_norm": 0.6542152124825684,
"learning_rate": 0.0001817618064633518,
"loss": 0.2287,
"step": 289
},
{
"epoch": 1.1,
"grad_norm": 0.5147785285870337,
"learning_rate": 0.0001816196912356222,
"loss": 0.182,
"step": 290
},
{
"epoch": 1.11,
"grad_norm": 0.5727570935780749,
"learning_rate": 0.00018147708047315587,
"loss": 0.1815,
"step": 291
},
{
"epoch": 1.11,
"grad_norm": 0.5657627178730809,
"learning_rate": 0.00018133397504178057,
"loss": 0.2132,
"step": 292
},
{
"epoch": 1.11,
"grad_norm": 0.5662562121438198,
"learning_rate": 0.00018119037581032724,
"loss": 0.2056,
"step": 293
},
{
"epoch": 1.12,
"grad_norm": 0.6099744881610398,
"learning_rate": 0.00018104628365062477,
"loss": 0.213,
"step": 294
},
{
"epoch": 1.12,
"grad_norm": 0.5460632540608352,
"learning_rate": 0.00018090169943749476,
"loss": 0.162,
"step": 295
},
{
"epoch": 1.13,
"grad_norm": 0.5391352126605721,
"learning_rate": 0.00018075662404874626,
"loss": 0.1893,
"step": 296
},
{
"epoch": 1.13,
"grad_norm": 0.6494284594879935,
"learning_rate": 0.00018061105836517024,
"loss": 0.2291,
"step": 297
},
{
"epoch": 1.13,
"grad_norm": 0.5004570177715079,
"learning_rate": 0.00018046500327053463,
"loss": 0.1541,
"step": 298
},
{
"epoch": 1.14,
"grad_norm": 0.5918866837652731,
"learning_rate": 0.0001803184596515784,
"loss": 0.2188,
"step": 299
},
{
"epoch": 1.14,
"grad_norm": 0.4814928700454864,
"learning_rate": 0.00018017142839800668,
"loss": 0.1371,
"step": 300
},
{
"epoch": 1.14,
"eval_blimp_filtered_avg": 0.7256716417910448,
"eval_blimp_filtered_std": 0.004928318952407523,
"step": 300
},
{
"epoch": 1.14,
"eval_blimp_supplement_avg": 0.8405172413793104,
"eval_blimp_supplement_std": 0.016405077514349695,
"step": 300
},
{
"epoch": 1.14,
"eval_vqa_filtered_avg": 0.52,
"eval_vqa_filtered_std": 0.05021167315686779,
"step": 300
},
{
"epoch": 1.14,
"eval_winoground_filtered_avg": 0.65,
"eval_winoground_filtered_std": 0.047937248544110196,
"step": 300
},
{
"epoch": 1.14,
"grad_norm": 0.5140990035555452,
"learning_rate": 0.0001800239104024851,
"loss": 0.1533,
"step": 301
},
{
"epoch": 1.15,
"grad_norm": 0.4933080278392121,
"learning_rate": 0.0001798759065606345,
"loss": 0.1556,
"step": 302
},
{
"epoch": 1.15,
"grad_norm": 0.5882761858054979,
"learning_rate": 0.00017972741777102523,
"loss": 0.2151,
"step": 303
},
{
"epoch": 1.16,
"grad_norm": 0.5476770873594954,
"learning_rate": 0.00017957844493517213,
"loss": 0.1809,
"step": 304
},
{
"epoch": 1.16,
"grad_norm": 0.6421832624123264,
"learning_rate": 0.0001794289889575286,
"loss": 0.2046,
"step": 305
},
{
"epoch": 1.16,
"grad_norm": 0.5179683138411204,
"learning_rate": 0.0001792790507454815,
"loss": 0.176,
"step": 306
},
{
"epoch": 1.17,
"grad_norm": 0.604477016740788,
"learning_rate": 0.00017912863120934534,
"loss": 0.2174,
"step": 307
},
{
"epoch": 1.17,
"grad_norm": 0.5803547980215563,
"learning_rate": 0.00017897773126235688,
"loss": 0.1599,
"step": 308
},
{
"epoch": 1.17,
"grad_norm": 0.5391766451927411,
"learning_rate": 0.0001788263518206697,
"loss": 0.1867,
"step": 309
},
{
"epoch": 1.18,
"grad_norm": 0.5008335310625588,
"learning_rate": 0.00017867449380334834,
"loss": 0.1543,
"step": 310
},
{
"epoch": 1.18,
"grad_norm": 0.5344073359347221,
"learning_rate": 0.00017852215813236305,
"loss": 0.1836,
"step": 311
},
{
"epoch": 1.19,
"grad_norm": 0.5560594943415273,
"learning_rate": 0.000178369345732584,
"loss": 0.1878,
"step": 312
},
{
"epoch": 1.19,
"grad_norm": 0.4967512656076653,
"learning_rate": 0.00017821605753177562,
"loss": 0.1643,
"step": 313
},
{
"epoch": 1.19,
"grad_norm": 0.4616678862036335,
"learning_rate": 0.00017806229446059124,
"loss": 0.1405,
"step": 314
},
{
"epoch": 1.2,
"grad_norm": 0.5366458032488048,
"learning_rate": 0.00017790805745256704,
"loss": 0.1563,
"step": 315
},
{
"epoch": 1.2,
"grad_norm": 0.5850965548297408,
"learning_rate": 0.00017775334744411678,
"loss": 0.1735,
"step": 316
},
{
"epoch": 1.21,
"grad_norm": 0.6019045486967092,
"learning_rate": 0.00017759816537452574,
"loss": 0.1702,
"step": 317
},
{
"epoch": 1.21,
"grad_norm": 0.5447191325941722,
"learning_rate": 0.00017744251218594542,
"loss": 0.1536,
"step": 318
},
{
"epoch": 1.21,
"grad_norm": 0.5782504760607824,
"learning_rate": 0.00017728638882338746,
"loss": 0.1765,
"step": 319
},
{
"epoch": 1.22,
"grad_norm": 0.6057705101997692,
"learning_rate": 0.00017712979623471807,
"loss": 0.1771,
"step": 320
},
{
"epoch": 1.22,
"grad_norm": 0.6097931752718987,
"learning_rate": 0.00017697273537065232,
"loss": 0.2139,
"step": 321
},
{
"epoch": 1.22,
"grad_norm": 0.56308892592465,
"learning_rate": 0.00017681520718474823,
"loss": 0.1959,
"step": 322
},
{
"epoch": 1.23,
"grad_norm": 0.48304728057105417,
"learning_rate": 0.00017665721263340113,
"loss": 0.1541,
"step": 323
},
{
"epoch": 1.23,
"grad_norm": 0.5046672087816175,
"learning_rate": 0.0001764987526758377,
"loss": 0.2057,
"step": 324
},
{
"epoch": 1.24,
"grad_norm": 0.591812842303311,
"learning_rate": 0.00017633982827411032,
"loss": 0.2117,
"step": 325
},
{
"epoch": 1.24,
"grad_norm": 0.5124492509013987,
"learning_rate": 0.00017618044039309098,
"loss": 0.1782,
"step": 326
},
{
"epoch": 1.24,
"grad_norm": 0.49461512298625476,
"learning_rate": 0.0001760205900004657,
"loss": 0.1587,
"step": 327
},
{
"epoch": 1.25,
"grad_norm": 0.5072005711040857,
"learning_rate": 0.00017586027806672857,
"loss": 0.1551,
"step": 328
},
{
"epoch": 1.25,
"grad_norm": 0.7166883343661715,
"learning_rate": 0.00017569950556517566,
"loss": 0.2556,
"step": 329
},
{
"epoch": 1.25,
"grad_norm": 0.6256555159933375,
"learning_rate": 0.00017553827347189938,
"loss": 0.183,
"step": 330
},
{
"epoch": 1.26,
"grad_norm": 0.533828362038465,
"learning_rate": 0.00017537658276578247,
"loss": 0.1606,
"step": 331
},
{
"epoch": 1.26,
"grad_norm": 0.5661675325630594,
"learning_rate": 0.00017521443442849188,
"loss": 0.1789,
"step": 332
},
{
"epoch": 1.27,
"grad_norm": 0.5680894670630955,
"learning_rate": 0.00017505182944447316,
"loss": 0.1879,
"step": 333
},
{
"epoch": 1.27,
"grad_norm": 0.4817701078567901,
"learning_rate": 0.00017488876880094413,
"loss": 0.1521,
"step": 334
},
{
"epoch": 1.27,
"grad_norm": 0.4821470961031985,
"learning_rate": 0.0001747252534878891,
"loss": 0.1463,
"step": 335
},
{
"epoch": 1.28,
"grad_norm": 0.49328987347131154,
"learning_rate": 0.0001745612844980528,
"loss": 0.1717,
"step": 336
},
{
"epoch": 1.28,
"grad_norm": 0.5124773897350241,
"learning_rate": 0.00017439686282693436,
"loss": 0.1664,
"step": 337
},
{
"epoch": 1.29,
"grad_norm": 0.4947058789125862,
"learning_rate": 0.00017423198947278117,
"loss": 0.155,
"step": 338
},
{
"epoch": 1.29,
"grad_norm": 0.49669905069620973,
"learning_rate": 0.00017406666543658304,
"loss": 0.1317,
"step": 339
},
{
"epoch": 1.29,
"grad_norm": 0.5945246572487285,
"learning_rate": 0.00017390089172206592,
"loss": 0.179,
"step": 340
},
{
"epoch": 1.3,
"grad_norm": 0.6088690966027199,
"learning_rate": 0.00017373466933568588,
"loss": 0.1688,
"step": 341
},
{
"epoch": 1.3,
"grad_norm": 0.5504854313763279,
"learning_rate": 0.00017356799928662297,
"loss": 0.1409,
"step": 342
},
{
"epoch": 1.3,
"grad_norm": 0.5616999197359607,
"learning_rate": 0.00017340088258677522,
"loss": 0.1611,
"step": 343
},
{
"epoch": 1.31,
"grad_norm": 0.5630261456683686,
"learning_rate": 0.00017323332025075223,
"loss": 0.1424,
"step": 344
},
{
"epoch": 1.31,
"grad_norm": 0.5286011853132182,
"learning_rate": 0.00017306531329586933,
"loss": 0.1435,
"step": 345
},
{
"epoch": 1.32,
"grad_norm": 0.5614256677889038,
"learning_rate": 0.00017289686274214118,
"loss": 0.1717,
"step": 346
},
{
"epoch": 1.32,
"grad_norm": 0.5877414440813399,
"learning_rate": 0.00017272796961227563,
"loss": 0.15,
"step": 347
},
{
"epoch": 1.32,
"grad_norm": 0.5368624648164415,
"learning_rate": 0.00017255863493166756,
"loss": 0.1549,
"step": 348
},
{
"epoch": 1.33,
"grad_norm": 0.5500707727315131,
"learning_rate": 0.0001723888597283926,
"loss": 0.175,
"step": 349
},
{
"epoch": 1.33,
"grad_norm": 0.6833139983554208,
"learning_rate": 0.00017221864503320092,
"loss": 0.1678,
"step": 350
},
{
"epoch": 1.33,
"grad_norm": 0.5282683935361436,
"learning_rate": 0.00017204799187951105,
"loss": 0.1464,
"step": 351
},
{
"epoch": 1.34,
"grad_norm": 0.6220938804700495,
"learning_rate": 0.00017187690130340328,
"loss": 0.1936,
"step": 352
},
{
"epoch": 1.34,
"grad_norm": 0.5368424100980813,
"learning_rate": 0.00017170537434361386,
"loss": 0.16,
"step": 353
},
{
"epoch": 1.35,
"grad_norm": 0.4964367798749474,
"learning_rate": 0.0001715334120415283,
"loss": 0.1315,
"step": 354
},
{
"epoch": 1.35,
"grad_norm": 0.5332125576256961,
"learning_rate": 0.00017136101544117525,
"loss": 0.1601,
"step": 355
},
{
"epoch": 1.35,
"grad_norm": 0.6452241501826331,
"learning_rate": 0.00017118818558922003,
"loss": 0.1944,
"step": 356
},
{
"epoch": 1.36,
"grad_norm": 0.5478001238846428,
"learning_rate": 0.00017101492353495845,
"loss": 0.1628,
"step": 357
},
{
"epoch": 1.36,
"grad_norm": 0.5907872420921463,
"learning_rate": 0.00017084123033031024,
"loss": 0.1979,
"step": 358
},
{
"epoch": 1.37,
"grad_norm": 0.5539786918398002,
"learning_rate": 0.0001706671070298128,
"loss": 0.1569,
"step": 359
},
{
"epoch": 1.37,
"grad_norm": 0.504557687915929,
"learning_rate": 0.00017049255469061474,
"loss": 0.1676,
"step": 360
},
{
"epoch": 1.37,
"grad_norm": 0.5193398413523059,
"learning_rate": 0.00017031757437246947,
"loss": 0.1535,
"step": 361
},
{
"epoch": 1.38,
"grad_norm": 0.5089502238747983,
"learning_rate": 0.00017014216713772884,
"loss": 0.1609,
"step": 362
},
{
"epoch": 1.38,
"grad_norm": 0.5090619290397491,
"learning_rate": 0.00016996633405133655,
"loss": 0.1601,
"step": 363
},
{
"epoch": 1.38,
"grad_norm": 0.5294032162956256,
"learning_rate": 0.00016979007618082175,
"loss": 0.1416,
"step": 364
},
{
"epoch": 1.39,
"grad_norm": 0.5376905990718213,
"learning_rate": 0.0001696133945962927,
"loss": 0.1704,
"step": 365
},
{
"epoch": 1.39,
"grad_norm": 0.6016988384720686,
"learning_rate": 0.0001694362903704299,
"loss": 0.1724,
"step": 366
},
{
"epoch": 1.4,
"grad_norm": 0.6471404474003535,
"learning_rate": 0.00016925876457848,
"loss": 0.1893,
"step": 367
},
{
"epoch": 1.4,
"grad_norm": 0.5121222508453968,
"learning_rate": 0.00016908081829824912,
"loss": 0.1166,
"step": 368
},
{
"epoch": 1.4,
"grad_norm": 0.5997272693042293,
"learning_rate": 0.0001689024526100961,
"loss": 0.2107,
"step": 369
},
{
"epoch": 1.41,
"grad_norm": 0.5636921436981437,
"learning_rate": 0.00016872366859692627,
"loss": 0.1805,
"step": 370
},
{
"epoch": 1.41,
"grad_norm": 0.6126951187530087,
"learning_rate": 0.00016854446734418466,
"loss": 0.1914,
"step": 371
},
{
"epoch": 1.41,
"grad_norm": 0.554597312718397,
"learning_rate": 0.0001683648499398495,
"loss": 0.1836,
"step": 372
},
{
"epoch": 1.42,
"grad_norm": 0.5003434649301144,
"learning_rate": 0.00016818481747442554,
"loss": 0.1631,
"step": 373
},
{
"epoch": 1.42,
"grad_norm": 0.5181592112650633,
"learning_rate": 0.0001680043710409375,
"loss": 0.1323,
"step": 374
},
{
"epoch": 1.43,
"grad_norm": 0.5773215207200062,
"learning_rate": 0.00016782351173492342,
"loss": 0.2156,
"step": 375
},
{
"epoch": 1.43,
"grad_norm": 0.46149339744192175,
"learning_rate": 0.00016764224065442796,
"loss": 0.1308,
"step": 376
},
{
"epoch": 1.43,
"grad_norm": 0.5154009122145548,
"learning_rate": 0.0001674605588999959,
"loss": 0.1723,
"step": 377
},
{
"epoch": 1.44,
"grad_norm": 0.5032409831088436,
"learning_rate": 0.0001672784675746651,
"loss": 0.1522,
"step": 378
},
{
"epoch": 1.44,
"grad_norm": 0.5765338444046834,
"learning_rate": 0.00016709596778396026,
"loss": 0.1746,
"step": 379
},
{
"epoch": 1.44,
"grad_norm": 0.6731025984083968,
"learning_rate": 0.00016691306063588583,
"loss": 0.2205,
"step": 380
},
{
"epoch": 1.45,
"grad_norm": 0.4837466365127126,
"learning_rate": 0.00016672974724091954,
"loss": 0.1553,
"step": 381
},
{
"epoch": 1.45,
"grad_norm": 0.5424268673951455,
"learning_rate": 0.00016654602871200546,
"loss": 0.1433,
"step": 382
},
{
"epoch": 1.46,
"grad_norm": 0.5171465106257717,
"learning_rate": 0.0001663619061645474,
"loss": 0.1698,
"step": 383
},
{
"epoch": 1.46,
"grad_norm": 0.5543423821478913,
"learning_rate": 0.00016617738071640208,
"loss": 0.1494,
"step": 384
},
{
"epoch": 1.46,
"grad_norm": 0.5614170511875883,
"learning_rate": 0.0001659924534878723,
"loss": 0.1719,
"step": 385
},
{
"epoch": 1.47,
"grad_norm": 0.5817955693629092,
"learning_rate": 0.0001658071256017001,
"loss": 0.1653,
"step": 386
},
{
"epoch": 1.47,
"grad_norm": 0.5093939905856546,
"learning_rate": 0.0001656213981830602,
"loss": 0.1331,
"step": 387
},
{
"epoch": 1.48,
"grad_norm": 0.530369374719384,
"learning_rate": 0.00016543527235955282,
"loss": 0.1703,
"step": 388
},
{
"epoch": 1.48,
"grad_norm": 0.5540917573532339,
"learning_rate": 0.00016524874926119717,
"loss": 0.1801,
"step": 389
},
{
"epoch": 1.48,
"grad_norm": 0.5189763553195834,
"learning_rate": 0.0001650618300204242,
"loss": 0.1526,
"step": 390
},
{
"epoch": 1.49,
"grad_norm": 0.4919336835652238,
"learning_rate": 0.00016487451577207018,
"loss": 0.1251,
"step": 391
},
{
"epoch": 1.49,
"grad_norm": 0.5303366245761384,
"learning_rate": 0.00016468680765336936,
"loss": 0.1504,
"step": 392
},
{
"epoch": 1.49,
"grad_norm": 0.579006149147232,
"learning_rate": 0.00016449870680394747,
"loss": 0.2013,
"step": 393
},
{
"epoch": 1.5,
"grad_norm": 0.5698441995370652,
"learning_rate": 0.0001643102143658145,
"loss": 0.1494,
"step": 394
},
{
"epoch": 1.5,
"grad_norm": 0.45335914900667046,
"learning_rate": 0.00016412133148335784,
"loss": 0.1196,
"step": 395
},
{
"epoch": 1.51,
"grad_norm": 0.500799551817792,
"learning_rate": 0.0001639320593033355,
"loss": 0.1576,
"step": 396
},
{
"epoch": 1.51,
"grad_norm": 0.6003518072235265,
"learning_rate": 0.000163742398974869,
"loss": 0.1584,
"step": 397
},
{
"epoch": 1.51,
"grad_norm": 0.6168365552101142,
"learning_rate": 0.00016355235164943626,
"loss": 0.2091,
"step": 398
},
{
"epoch": 1.52,
"grad_norm": 0.5713474186129687,
"learning_rate": 0.0001633619184808649,
"loss": 0.1567,
"step": 399
},
{
"epoch": 1.52,
"grad_norm": 0.5874214731593885,
"learning_rate": 0.0001631711006253251,
"loss": 0.1574,
"step": 400
},
{
"epoch": 1.52,
"eval_blimp_filtered_avg": 0.7201492537313433,
"eval_blimp_filtered_std": 0.004978357158642791,
"step": 400
},
{
"epoch": 1.52,
"eval_blimp_supplement_avg": 0.8297413793103449,
"eval_blimp_supplement_std": 0.01657166693464671,
"step": 400
},
{
"epoch": 1.52,
"eval_vqa_filtered_avg": 0.49,
"eval_vqa_filtered_std": 0.05024183937956912,
"step": 400
},
{
"epoch": 1.52,
"eval_winoground_filtered_avg": 0.61,
"eval_winoground_filtered_std": 0.04902071300001975,
"step": 400
},
{
"epoch": 1.52,
"grad_norm": 0.6042802091257128,
"learning_rate": 0.00016297989924132252,
"loss": 0.1818,
"step": 401
},
{
"epoch": 1.53,
"grad_norm": 0.47364098207681,
"learning_rate": 0.00016278831548969134,
"loss": 0.1328,
"step": 402
},
{
"epoch": 1.53,
"grad_norm": 0.5471235998749361,
"learning_rate": 0.00016259635053358717,
"loss": 0.1507,
"step": 403
},
{
"epoch": 1.54,
"grad_norm": 0.5880254357081499,
"learning_rate": 0.00016240400553848007,
"loss": 0.1797,
"step": 404
},
{
"epoch": 1.54,
"grad_norm": 0.4946694660134096,
"learning_rate": 0.0001622112816721474,
"loss": 0.1445,
"step": 405
},
{
"epoch": 1.54,
"grad_norm": 0.5391138081584693,
"learning_rate": 0.0001620181801046667,
"loss": 0.1592,
"step": 406
},
{
"epoch": 1.55,
"grad_norm": 0.4600495974038577,
"learning_rate": 0.00016182470200840868,
"loss": 0.1255,
"step": 407
},
{
"epoch": 1.55,
"grad_norm": 0.4600858105564683,
"learning_rate": 0.00016163084855803006,
"loss": 0.1274,
"step": 408
},
{
"epoch": 1.56,
"grad_norm": 0.4869984031862606,
"learning_rate": 0.00016143662093046638,
"loss": 0.1312,
"step": 409
},
{
"epoch": 1.56,
"grad_norm": 0.5760667761271687,
"learning_rate": 0.000161242020304925,
"loss": 0.1956,
"step": 410
},
{
"epoch": 1.56,
"grad_norm": 0.4914141963984456,
"learning_rate": 0.0001610470478628778,
"loss": 0.1305,
"step": 411
},
{
"epoch": 1.57,
"grad_norm": 0.5844115229455148,
"learning_rate": 0.00016085170478805395,
"loss": 0.1767,
"step": 412
},
{
"epoch": 1.57,
"grad_norm": 0.5609976894365024,
"learning_rate": 0.00016065599226643303,
"loss": 0.1568,
"step": 413
},
{
"epoch": 1.57,
"grad_norm": 0.5199227181198156,
"learning_rate": 0.0001604599114862375,
"loss": 0.1358,
"step": 414
},
{
"epoch": 1.58,
"grad_norm": 0.580620607781089,
"learning_rate": 0.00016026346363792567,
"loss": 0.1568,
"step": 415
},
{
"epoch": 1.58,
"grad_norm": 0.5406448317458065,
"learning_rate": 0.00016006664991418434,
"loss": 0.1398,
"step": 416
},
{
"epoch": 1.59,
"grad_norm": 0.44523867426706976,
"learning_rate": 0.00015986947150992172,
"loss": 0.1355,
"step": 417
},
{
"epoch": 1.59,
"grad_norm": 0.5115419137782133,
"learning_rate": 0.0001596719296222601,
"loss": 0.1468,
"step": 418
},
{
"epoch": 1.59,
"grad_norm": 0.5985959567307599,
"learning_rate": 0.0001594740254505285,
"loss": 0.1378,
"step": 419
},
{
"epoch": 1.6,
"grad_norm": 0.6222671216404903,
"learning_rate": 0.0001592757601962555,
"loss": 0.184,
"step": 420
},
{
"epoch": 1.6,
"grad_norm": 0.5413250088547485,
"learning_rate": 0.00015907713506316192,
"loss": 0.1758,
"step": 421
},
{
"epoch": 1.6,
"grad_norm": 0.5382013654400807,
"learning_rate": 0.00015887815125715344,
"loss": 0.1509,
"step": 422
},
{
"epoch": 1.61,
"grad_norm": 0.48249026147532587,
"learning_rate": 0.00015867880998631347,
"loss": 0.1183,
"step": 423
},
{
"epoch": 1.61,
"grad_norm": 0.5223760532607825,
"learning_rate": 0.0001584791124608955,
"loss": 0.1446,
"step": 424
},
{
"epoch": 1.62,
"grad_norm": 0.42511278258760743,
"learning_rate": 0.0001582790598933161,
"loss": 0.1085,
"step": 425
},
{
"epoch": 1.62,
"grad_norm": 0.5280124307363728,
"learning_rate": 0.00015807865349814733,
"loss": 0.1211,
"step": 426
},
{
"epoch": 1.62,
"grad_norm": 0.44071272749150264,
"learning_rate": 0.00015787789449210938,
"loss": 0.1199,
"step": 427
},
{
"epoch": 1.63,
"grad_norm": 0.5672982836442205,
"learning_rate": 0.0001576767840940633,
"loss": 0.1653,
"step": 428
},
{
"epoch": 1.63,
"grad_norm": 0.5407137471822783,
"learning_rate": 0.00015747532352500357,
"loss": 0.1459,
"step": 429
},
{
"epoch": 1.63,
"grad_norm": 0.47958987114022433,
"learning_rate": 0.00015727351400805052,
"loss": 0.122,
"step": 430
},
{
"epoch": 1.64,
"grad_norm": 0.4949008797838833,
"learning_rate": 0.0001570713567684432,
"loss": 0.1476,
"step": 431
},
{
"epoch": 1.64,
"grad_norm": 0.5706163209211994,
"learning_rate": 0.0001568688530335316,
"loss": 0.1859,
"step": 432
},
{
"epoch": 1.65,
"grad_norm": 0.48156623541863475,
"learning_rate": 0.0001566660040327695,
"loss": 0.1354,
"step": 433
},
{
"epoch": 1.65,
"grad_norm": 0.48592626955333434,
"learning_rate": 0.00015646281099770682,
"loss": 0.1287,
"step": 434
},
{
"epoch": 1.65,
"grad_norm": 0.42812001499236796,
"learning_rate": 0.00015625927516198232,
"loss": 0.1041,
"step": 435
},
{
"epoch": 1.66,
"grad_norm": 0.5351459013098889,
"learning_rate": 0.0001560553977613158,
"loss": 0.1321,
"step": 436
},
{
"epoch": 1.66,
"grad_norm": 0.603892324255596,
"learning_rate": 0.00015585118003350092,
"loss": 0.1524,
"step": 437
},
{
"epoch": 1.67,
"grad_norm": 0.5704162659549111,
"learning_rate": 0.00015564662321839755,
"loss": 0.1643,
"step": 438
},
{
"epoch": 1.67,
"grad_norm": 0.5619464269808508,
"learning_rate": 0.00015544172855792423,
"loss": 0.1695,
"step": 439
},
{
"epoch": 1.67,
"grad_norm": 0.5552857752448269,
"learning_rate": 0.0001552364972960506,
"loss": 0.1436,
"step": 440
},
{
"epoch": 1.68,
"grad_norm": 0.4859084061375966,
"learning_rate": 0.00015503093067878996,
"loss": 0.1304,
"step": 441
},
{
"epoch": 1.68,
"grad_norm": 0.44977231534740475,
"learning_rate": 0.00015482502995419167,
"loss": 0.1328,
"step": 442
},
{
"epoch": 1.68,
"grad_norm": 0.4713213980355454,
"learning_rate": 0.0001546187963723334,
"loss": 0.1105,
"step": 443
},
{
"epoch": 1.69,
"grad_norm": 0.5341277409726155,
"learning_rate": 0.00015441223118531388,
"loss": 0.1449,
"step": 444
},
{
"epoch": 1.69,
"grad_norm": 0.5705390112616555,
"learning_rate": 0.00015420533564724495,
"loss": 0.1695,
"step": 445
},
{
"epoch": 1.7,
"grad_norm": 0.4774790901030609,
"learning_rate": 0.00015399811101424418,
"loss": 0.1403,
"step": 446
},
{
"epoch": 1.7,
"grad_norm": 0.5512200465464098,
"learning_rate": 0.00015379055854442708,
"loss": 0.1643,
"step": 447
},
{
"epoch": 1.7,
"grad_norm": 0.5219733540585905,
"learning_rate": 0.00015358267949789966,
"loss": 0.1237,
"step": 448
},
{
"epoch": 1.71,
"grad_norm": 0.5033089815712022,
"learning_rate": 0.0001533744751367506,
"loss": 0.1291,
"step": 449
},
{
"epoch": 1.71,
"grad_norm": 0.4747834365201173,
"learning_rate": 0.0001531659467250436,
"loss": 0.1275,
"step": 450
},
{
"epoch": 1.71,
"grad_norm": 0.40965665913605903,
"learning_rate": 0.0001529570955288099,
"loss": 0.1156,
"step": 451
},
{
"epoch": 1.72,
"grad_norm": 0.440005816758666,
"learning_rate": 0.00015274792281604028,
"loss": 0.1099,
"step": 452
},
{
"epoch": 1.72,
"grad_norm": 0.5579574679556536,
"learning_rate": 0.00015253842985667762,
"loss": 0.1456,
"step": 453
},
{
"epoch": 1.73,
"grad_norm": 0.4959791770798946,
"learning_rate": 0.0001523286179226091,
"loss": 0.1468,
"step": 454
},
{
"epoch": 1.73,
"grad_norm": 0.5066322640800116,
"learning_rate": 0.0001521184882876585,
"loss": 0.1085,
"step": 455
},
{
"epoch": 1.73,
"grad_norm": 0.5297332606909233,
"learning_rate": 0.00015190804222757845,
"loss": 0.1476,
"step": 456
},
{
"epoch": 1.74,
"grad_norm": 0.5830381345482916,
"learning_rate": 0.00015169728102004256,
"loss": 0.1424,
"step": 457
},
{
"epoch": 1.74,
"grad_norm": 0.44997991084618094,
"learning_rate": 0.00015148620594463794,
"loss": 0.1068,
"step": 458
},
{
"epoch": 1.75,
"grad_norm": 0.5459028855766525,
"learning_rate": 0.00015127481828285718,
"loss": 0.1673,
"step": 459
},
{
"epoch": 1.75,
"grad_norm": 0.46924015811236747,
"learning_rate": 0.0001510631193180907,
"loss": 0.1136,
"step": 460
},
{
"epoch": 1.75,
"grad_norm": 0.5011418751892371,
"learning_rate": 0.00015085111033561895,
"loss": 0.1154,
"step": 461
},
{
"epoch": 1.76,
"grad_norm": 0.5328628999011582,
"learning_rate": 0.00015063879262260446,
"loss": 0.1397,
"step": 462
},
{
"epoch": 1.76,
"grad_norm": 0.4964913122706592,
"learning_rate": 0.00015042616746808435,
"loss": 0.144,
"step": 463
},
{
"epoch": 1.76,
"grad_norm": 0.501470133961365,
"learning_rate": 0.00015021323616296213,
"loss": 0.1379,
"step": 464
},
{
"epoch": 1.77,
"grad_norm": 0.5269212199791002,
"learning_rate": 0.00015000000000000001,
"loss": 0.1311,
"step": 465
},
{
"epoch": 1.77,
"grad_norm": 0.5049669772855443,
"learning_rate": 0.00014978646027381123,
"loss": 0.1196,
"step": 466
},
{
"epoch": 1.78,
"grad_norm": 0.4525225552090921,
"learning_rate": 0.00014957261828085191,
"loss": 0.1141,
"step": 467
},
{
"epoch": 1.78,
"grad_norm": 0.4790059846117726,
"learning_rate": 0.0001493584753194134,
"loss": 0.1028,
"step": 468
},
{
"epoch": 1.78,
"grad_norm": 0.5832150044411881,
"learning_rate": 0.00014914403268961426,
"loss": 0.158,
"step": 469
},
{
"epoch": 1.79,
"grad_norm": 0.522112830999454,
"learning_rate": 0.00014892929169339235,
"loss": 0.1459,
"step": 470
},
{
"epoch": 1.79,
"grad_norm": 0.4887536825683872,
"learning_rate": 0.00014871425363449718,
"loss": 0.1263,
"step": 471
},
{
"epoch": 1.79,
"grad_norm": 0.467502684654505,
"learning_rate": 0.00014849891981848158,
"loss": 0.1388,
"step": 472
},
{
"epoch": 1.8,
"grad_norm": 0.4546537818819024,
"learning_rate": 0.0001482832915526942,
"loss": 0.105,
"step": 473
},
{
"epoch": 1.8,
"grad_norm": 0.541602974136478,
"learning_rate": 0.00014806737014627124,
"loss": 0.1388,
"step": 474
},
{
"epoch": 1.81,
"grad_norm": 0.5066261091776695,
"learning_rate": 0.00014785115691012864,
"loss": 0.1231,
"step": 475
},
{
"epoch": 1.81,
"grad_norm": 0.6064880991698398,
"learning_rate": 0.00014763465315695425,
"loss": 0.173,
"step": 476
},
{
"epoch": 1.81,
"grad_norm": 0.5259754097567955,
"learning_rate": 0.00014741786020119955,
"loss": 0.1383,
"step": 477
},
{
"epoch": 1.82,
"grad_norm": 0.5335061292186741,
"learning_rate": 0.00014720077935907196,
"loss": 0.1671,
"step": 478
},
{
"epoch": 1.82,
"grad_norm": 0.43421172131647107,
"learning_rate": 0.00014698341194852664,
"loss": 0.1207,
"step": 479
},
{
"epoch": 1.83,
"grad_norm": 0.4476149707237085,
"learning_rate": 0.00014676575928925867,
"loss": 0.1103,
"step": 480
},
{
"epoch": 1.83,
"grad_norm": 0.49116627439944716,
"learning_rate": 0.00014654782270269497,
"loss": 0.1195,
"step": 481
},
{
"epoch": 1.83,
"grad_norm": 0.44289326248169675,
"learning_rate": 0.00014632960351198618,
"loss": 0.1217,
"step": 482
},
{
"epoch": 1.84,
"grad_norm": 0.4022818703819969,
"learning_rate": 0.00014611110304199872,
"loss": 0.0868,
"step": 483
},
{
"epoch": 1.84,
"grad_norm": 0.43714254165230704,
"learning_rate": 0.00014589232261930674,
"loss": 0.1062,
"step": 484
},
{
"epoch": 1.84,
"grad_norm": 0.4499910364889525,
"learning_rate": 0.00014567326357218407,
"loss": 0.1115,
"step": 485
},
{
"epoch": 1.85,
"grad_norm": 0.4812264446699121,
"learning_rate": 0.00014545392723059616,
"loss": 0.1272,
"step": 486
},
{
"epoch": 1.85,
"grad_norm": 0.5466847826869639,
"learning_rate": 0.0001452343149261919,
"loss": 0.1387,
"step": 487
},
{
"epoch": 1.86,
"grad_norm": 0.4671789379009673,
"learning_rate": 0.00014501442799229572,
"loss": 0.1227,
"step": 488
},
{
"epoch": 1.86,
"grad_norm": 0.4168568601736311,
"learning_rate": 0.00014479426776389936,
"loss": 0.0955,
"step": 489
},
{
"epoch": 1.86,
"grad_norm": 0.4236347478067532,
"learning_rate": 0.00014457383557765386,
"loss": 0.1226,
"step": 490
},
{
"epoch": 1.87,
"grad_norm": 0.49129515437526694,
"learning_rate": 0.00014435313277186125,
"loss": 0.1411,
"step": 491
},
{
"epoch": 1.87,
"grad_norm": 0.4395945017572998,
"learning_rate": 0.00014413216068646668,
"loss": 0.1306,
"step": 492
},
{
"epoch": 1.87,
"grad_norm": 0.4832622577980266,
"learning_rate": 0.0001439109206630501,
"loss": 0.1133,
"step": 493
},
{
"epoch": 1.88,
"grad_norm": 0.4840123051282916,
"learning_rate": 0.0001436894140448183,
"loss": 0.1354,
"step": 494
},
{
"epoch": 1.88,
"grad_norm": 0.6106816446433545,
"learning_rate": 0.00014346764217659653,
"loss": 0.1759,
"step": 495
},
{
"epoch": 1.89,
"grad_norm": 0.4439633620623163,
"learning_rate": 0.0001432456064048204,
"loss": 0.105,
"step": 496
},
{
"epoch": 1.89,
"grad_norm": 0.6103065192992649,
"learning_rate": 0.00014302330807752786,
"loss": 0.1736,
"step": 497
},
{
"epoch": 1.89,
"grad_norm": 0.4594414148489128,
"learning_rate": 0.0001428007485443509,
"loss": 0.1067,
"step": 498
},
{
"epoch": 1.9,
"grad_norm": 0.48309766302252394,
"learning_rate": 0.00014257792915650728,
"loss": 0.1319,
"step": 499
},
{
"epoch": 1.9,
"grad_norm": 0.4979398119598126,
"learning_rate": 0.00014235485126679243,
"loss": 0.1361,
"step": 500
},
{
"epoch": 1.9,
"eval_blimp_filtered_avg": 0.7231343283582089,
"eval_blimp_filtered_std": 0.004963011033511667,
"step": 500
},
{
"epoch": 1.9,
"eval_blimp_supplement_avg": 0.8275862068965517,
"eval_blimp_supplement_std": 0.0167403676680407,
"step": 500
},
{
"epoch": 1.9,
"eval_vqa_filtered_avg": 0.47,
"eval_vqa_filtered_std": 0.0501613558046592,
"step": 500
},
{
"epoch": 1.9,
"eval_winoground_filtered_avg": 0.64,
"eval_winoground_filtered_std": 0.048241815132442176,
"step": 500
},
{
"epoch": 1.9,
"grad_norm": 0.4877241025128464,
"learning_rate": 0.00014213151622957128,
"loss": 0.1502,
"step": 501
},
{
"epoch": 1.91,
"grad_norm": 0.5812751924824102,
"learning_rate": 0.00014190792540076986,
"loss": 0.1687,
"step": 502
},
{
"epoch": 1.91,
"grad_norm": 0.4627935359645996,
"learning_rate": 0.00014168408013786728,
"loss": 0.1246,
"step": 503
},
{
"epoch": 1.92,
"grad_norm": 0.4587837001229827,
"learning_rate": 0.00014145998179988735,
"loss": 0.1072,
"step": 504
},
{
"epoch": 1.92,
"grad_norm": 0.5196182852163257,
"learning_rate": 0.00014123563174739037,
"loss": 0.1269,
"step": 505
},
{
"epoch": 1.92,
"grad_norm": 0.42954431425652795,
"learning_rate": 0.0001410110313424648,
"loss": 0.1034,
"step": 506
},
{
"epoch": 1.93,
"grad_norm": 0.4551597635457604,
"learning_rate": 0.00014078618194871914,
"loss": 0.1084,
"step": 507
},
{
"epoch": 1.93,
"grad_norm": 0.5751209424815239,
"learning_rate": 0.0001405610849312736,
"loss": 0.136,
"step": 508
},
{
"epoch": 1.94,
"grad_norm": 0.5118033581542103,
"learning_rate": 0.00014033574165675164,
"loss": 0.1241,
"step": 509
},
{
"epoch": 1.94,
"grad_norm": 0.43644759709330094,
"learning_rate": 0.00014011015349327187,
"loss": 0.117,
"step": 510
},
{
"epoch": 1.94,
"grad_norm": 0.4958861591526709,
"learning_rate": 0.00013988432181043982,
"loss": 0.1163,
"step": 511
},
{
"epoch": 1.95,
"grad_norm": 0.44950223725406846,
"learning_rate": 0.00013965824797933926,
"loss": 0.1013,
"step": 512
},
{
"epoch": 1.95,
"grad_norm": 0.5174784327558367,
"learning_rate": 0.0001394319333725243,
"loss": 0.1228,
"step": 513
},
{
"epoch": 1.95,
"grad_norm": 0.451176763896539,
"learning_rate": 0.00013920537936401077,
"loss": 0.1137,
"step": 514
},
{
"epoch": 1.96,
"grad_norm": 0.35979415227278505,
"learning_rate": 0.00013897858732926793,
"loss": 0.0893,
"step": 515
},
{
"epoch": 1.96,
"grad_norm": 0.4087154672782133,
"learning_rate": 0.0001387515586452103,
"loss": 0.0979,
"step": 516
},
{
"epoch": 1.97,
"grad_norm": 0.4238917271153031,
"learning_rate": 0.000138524294690189,
"loss": 0.0902,
"step": 517
},
{
"epoch": 1.97,
"grad_norm": 0.37289193742062965,
"learning_rate": 0.00013829679684398375,
"loss": 0.0905,
"step": 518
},
{
"epoch": 1.97,
"grad_norm": 0.4561111955028976,
"learning_rate": 0.000138069066487794,
"loss": 0.1154,
"step": 519
},
{
"epoch": 1.98,
"grad_norm": 0.4795856880336275,
"learning_rate": 0.00013784110500423104,
"loss": 0.1053,
"step": 520
},
{
"epoch": 1.98,
"grad_norm": 0.5758379418361174,
"learning_rate": 0.00013761291377730936,
"loss": 0.1646,
"step": 521
},
{
"epoch": 1.98,
"grad_norm": 0.3967821645775129,
"learning_rate": 0.00013738449419243827,
"loss": 0.0797,
"step": 522
},
{
"epoch": 1.99,
"grad_norm": 0.5352641990707583,
"learning_rate": 0.00013715584763641345,
"loss": 0.1205,
"step": 523
},
{
"epoch": 1.99,
"grad_norm": 0.39673398503322155,
"learning_rate": 0.0001369269754974087,
"loss": 0.0672,
"step": 524
},
{
"epoch": 2.0,
"grad_norm": 0.5673600812406742,
"learning_rate": 0.00013669787916496722,
"loss": 0.1269,
"step": 525
},
{
"epoch": 2.0,
"grad_norm": 0.5134798249291933,
"learning_rate": 0.00013646856002999354,
"loss": 0.1275,
"step": 526
},
{
"epoch": 2.0,
"grad_norm": 0.26137163241501854,
"learning_rate": 0.00013623901948474473,
"loss": 0.0391,
"step": 527
},
{
"epoch": 2.01,
"grad_norm": 0.3318640399705981,
"learning_rate": 0.00013600925892282218,
"loss": 0.0414,
"step": 528
},
{
"epoch": 2.01,
"grad_norm": 0.39323951881930796,
"learning_rate": 0.00013577927973916306,
"loss": 0.054,
"step": 529
},
{
"epoch": 2.02,
"grad_norm": 0.31507008260792096,
"learning_rate": 0.0001355490833300318,
"loss": 0.042,
"step": 530
},
{
"epoch": 2.02,
"grad_norm": 0.351605629418437,
"learning_rate": 0.00013531867109301175,
"loss": 0.0419,
"step": 531
},
{
"epoch": 2.02,
"grad_norm": 0.300514728252508,
"learning_rate": 0.00013508804442699648,
"loss": 0.0432,
"step": 532
},
{
"epoch": 2.03,
"grad_norm": 0.429759783149188,
"learning_rate": 0.00013485720473218154,
"loss": 0.0434,
"step": 533
},
{
"epoch": 2.03,
"grad_norm": 0.3371850480060168,
"learning_rate": 0.00013462615341005573,
"loss": 0.0398,
"step": 534
},
{
"epoch": 2.03,
"grad_norm": 0.41727641506204793,
"learning_rate": 0.00013439489186339282,
"loss": 0.0532,
"step": 535
},
{
"epoch": 2.04,
"grad_norm": 0.4526462472237055,
"learning_rate": 0.0001341634214962428,
"loss": 0.0504,
"step": 536
},
{
"epoch": 2.04,
"grad_norm": 0.394759014800421,
"learning_rate": 0.00013393174371392348,
"loss": 0.0362,
"step": 537
},
{
"epoch": 2.05,
"grad_norm": 0.43530217979086694,
"learning_rate": 0.00013369985992301198,
"loss": 0.0581,
"step": 538
},
{
"epoch": 2.05,
"grad_norm": 0.3489279774411472,
"learning_rate": 0.00013346777153133615,
"loss": 0.0428,
"step": 539
},
{
"epoch": 2.05,
"grad_norm": 0.3524007670573337,
"learning_rate": 0.00013323547994796597,
"loss": 0.0392,
"step": 540
},
{
"epoch": 2.06,
"grad_norm": 0.4161180575138653,
"learning_rate": 0.00013300298658320517,
"loss": 0.0435,
"step": 541
},
{
"epoch": 2.06,
"grad_norm": 0.36047484402484764,
"learning_rate": 0.00013277029284858237,
"loss": 0.0453,
"step": 542
},
{
"epoch": 2.06,
"grad_norm": 0.3660355277365804,
"learning_rate": 0.00013253740015684284,
"loss": 0.0442,
"step": 543
},
{
"epoch": 2.07,
"grad_norm": 0.3239828127697563,
"learning_rate": 0.00013230430992193973,
"loss": 0.0405,
"step": 544
},
{
"epoch": 2.07,
"grad_norm": 0.4158448690843572,
"learning_rate": 0.00013207102355902552,
"loss": 0.0524,
"step": 545
},
{
"epoch": 2.08,
"grad_norm": 0.27764986772544564,
"learning_rate": 0.00013183754248444343,
"loss": 0.0332,
"step": 546
},
{
"epoch": 2.08,
"grad_norm": 0.32771023150803125,
"learning_rate": 0.00013160386811571876,
"loss": 0.0459,
"step": 547
},
{
"epoch": 2.08,
"grad_norm": 0.29940107859442794,
"learning_rate": 0.0001313700018715505,
"loss": 0.0377,
"step": 548
},
{
"epoch": 2.09,
"grad_norm": 0.3819582084310897,
"learning_rate": 0.00013113594517180242,
"loss": 0.058,
"step": 549
},
{
"epoch": 2.09,
"grad_norm": 0.32714441974400016,
"learning_rate": 0.00013090169943749476,
"loss": 0.0437,
"step": 550
},
{
"epoch": 2.1,
"grad_norm": 0.36220473718120416,
"learning_rate": 0.00013066726609079526,
"loss": 0.0425,
"step": 551
},
{
"epoch": 2.1,
"grad_norm": 0.3426348230655199,
"learning_rate": 0.00013043264655501074,
"loss": 0.0396,
"step": 552
},
{
"epoch": 2.1,
"grad_norm": 0.32235085289402154,
"learning_rate": 0.00013019784225457855,
"loss": 0.0382,
"step": 553
},
{
"epoch": 2.11,
"grad_norm": 0.2964086383376768,
"learning_rate": 0.0001299628546150577,
"loss": 0.0382,
"step": 554
},
{
"epoch": 2.11,
"grad_norm": 0.27015454300882835,
"learning_rate": 0.00012972768506312027,
"loss": 0.0283,
"step": 555
},
{
"epoch": 2.11,
"grad_norm": 0.33035662649730047,
"learning_rate": 0.00012949233502654284,
"loss": 0.0384,
"step": 556
},
{
"epoch": 2.12,
"grad_norm": 0.3187428949344172,
"learning_rate": 0.00012925680593419778,
"loss": 0.032,
"step": 557
},
{
"epoch": 2.12,
"grad_norm": 0.3399662779151198,
"learning_rate": 0.00012902109921604448,
"loss": 0.0405,
"step": 558
},
{
"epoch": 2.13,
"grad_norm": 0.35365859482095496,
"learning_rate": 0.00012878521630312078,
"loss": 0.0397,
"step": 559
},
{
"epoch": 2.13,
"grad_norm": 0.30675985331556893,
"learning_rate": 0.00012854915862753422,
"loss": 0.035,
"step": 560
},
{
"epoch": 2.13,
"grad_norm": 0.414074353684469,
"learning_rate": 0.0001283129276224534,
"loss": 0.052,
"step": 561
},
{
"epoch": 2.14,
"grad_norm": 0.38344430660186296,
"learning_rate": 0.0001280765247220993,
"loss": 0.0359,
"step": 562
},
{
"epoch": 2.14,
"grad_norm": 0.3137291177532792,
"learning_rate": 0.0001278399513617364,
"loss": 0.031,
"step": 563
},
{
"epoch": 2.14,
"grad_norm": 0.39542817136382147,
"learning_rate": 0.0001276032089776642,
"loss": 0.0438,
"step": 564
},
{
"epoch": 2.15,
"grad_norm": 0.3778319793304532,
"learning_rate": 0.0001273662990072083,
"loss": 0.0485,
"step": 565
},
{
"epoch": 2.15,
"grad_norm": 0.35242405330198695,
"learning_rate": 0.0001271292228887118,
"loss": 0.0395,
"step": 566
},
{
"epoch": 2.16,
"grad_norm": 0.24773421468492,
"learning_rate": 0.00012689198206152657,
"loss": 0.0205,
"step": 567
},
{
"epoch": 2.16,
"grad_norm": 0.37351452341587044,
"learning_rate": 0.00012665457796600443,
"loss": 0.0403,
"step": 568
},
{
"epoch": 2.16,
"grad_norm": 0.29584266385998476,
"learning_rate": 0.0001264170120434884,
"loss": 0.0332,
"step": 569
},
{
"epoch": 2.17,
"grad_norm": 0.28547841885732445,
"learning_rate": 0.00012617928573630406,
"loss": 0.0288,
"step": 570
},
{
"epoch": 2.17,
"grad_norm": 0.37964584553416536,
"learning_rate": 0.0001259414004877507,
"loss": 0.0348,
"step": 571
},
{
"epoch": 2.17,
"grad_norm": 0.36767195929723195,
"learning_rate": 0.0001257033577420926,
"loss": 0.0404,
"step": 572
},
{
"epoch": 2.18,
"grad_norm": 0.3509417575248461,
"learning_rate": 0.00012546515894455026,
"loss": 0.0373,
"step": 573
},
{
"epoch": 2.18,
"grad_norm": 0.2848574183498485,
"learning_rate": 0.00012522680554129156,
"loss": 0.0338,
"step": 574
},
{
"epoch": 2.19,
"grad_norm": 0.3019415495988161,
"learning_rate": 0.0001249882989794231,
"loss": 0.0342,
"step": 575
},
{
"epoch": 2.19,
"grad_norm": 0.2990668841044976,
"learning_rate": 0.00012474964070698127,
"loss": 0.0335,
"step": 576
},
{
"epoch": 2.19,
"grad_norm": 0.4328132625325146,
"learning_rate": 0.00012451083217292357,
"loss": 0.0394,
"step": 577
},
{
"epoch": 2.2,
"grad_norm": 0.4292821742979654,
"learning_rate": 0.00012427187482711986,
"loss": 0.0516,
"step": 578
},
{
"epoch": 2.2,
"grad_norm": 0.2686010674908365,
"learning_rate": 0.0001240327701203433,
"loss": 0.0344,
"step": 579
},
{
"epoch": 2.21,
"grad_norm": 0.28452697548238837,
"learning_rate": 0.00012379351950426187,
"loss": 0.0307,
"step": 580
},
{
"epoch": 2.21,
"grad_norm": 0.3597827634214583,
"learning_rate": 0.00012355412443142936,
"loss": 0.0429,
"step": 581
},
{
"epoch": 2.21,
"grad_norm": 0.31337666682629667,
"learning_rate": 0.00012331458635527658,
"loss": 0.038,
"step": 582
},
{
"epoch": 2.22,
"grad_norm": 0.3751035825027268,
"learning_rate": 0.0001230749067301025,
"loss": 0.0385,
"step": 583
},
{
"epoch": 2.22,
"grad_norm": 0.3982139708950298,
"learning_rate": 0.00012283508701106557,
"loss": 0.0461,
"step": 584
},
{
"epoch": 2.22,
"grad_norm": 0.3672383693603079,
"learning_rate": 0.00012259512865417477,
"loss": 0.045,
"step": 585
},
{
"epoch": 2.23,
"grad_norm": 0.3836978621116552,
"learning_rate": 0.00012235503311628073,
"loss": 0.0434,
"step": 586
},
{
"epoch": 2.23,
"grad_norm": 0.3661127438866315,
"learning_rate": 0.00012211480185506698,
"loss": 0.0374,
"step": 587
},
{
"epoch": 2.24,
"grad_norm": 0.39751744692779245,
"learning_rate": 0.00012187443632904105,
"loss": 0.0334,
"step": 588
},
{
"epoch": 2.24,
"grad_norm": 0.2943198017134509,
"learning_rate": 0.00012163393799752565,
"loss": 0.0354,
"step": 589
},
{
"epoch": 2.24,
"grad_norm": 0.3002324681197963,
"learning_rate": 0.00012139330832064974,
"loss": 0.0263,
"step": 590
},
{
"epoch": 2.25,
"grad_norm": 0.29049707177485556,
"learning_rate": 0.00012115254875933979,
"loss": 0.0374,
"step": 591
},
{
"epoch": 2.25,
"grad_norm": 0.36487904735548193,
"learning_rate": 0.00012091166077531075,
"loss": 0.0366,
"step": 592
},
{
"epoch": 2.25,
"grad_norm": 0.6001906832098756,
"learning_rate": 0.00012067064583105729,
"loss": 0.0397,
"step": 593
},
{
"epoch": 2.26,
"grad_norm": 0.29594265959931015,
"learning_rate": 0.00012042950538984492,
"loss": 0.0339,
"step": 594
},
{
"epoch": 2.26,
"grad_norm": 0.3136944220749086,
"learning_rate": 0.00012018824091570103,
"loss": 0.0383,
"step": 595
},
{
"epoch": 2.27,
"grad_norm": 0.34881594693482015,
"learning_rate": 0.00011994685387340607,
"loss": 0.0328,
"step": 596
},
{
"epoch": 2.27,
"grad_norm": 0.343330706863882,
"learning_rate": 0.00011970534572848464,
"loss": 0.0402,
"step": 597
},
{
"epoch": 2.27,
"grad_norm": 0.34079159501332024,
"learning_rate": 0.00011946371794719656,
"loss": 0.0351,
"step": 598
},
{
"epoch": 2.28,
"grad_norm": 0.2827419424824256,
"learning_rate": 0.000119221971996528,
"loss": 0.0257,
"step": 599
},
{
"epoch": 2.28,
"grad_norm": 0.3123401643495521,
"learning_rate": 0.0001189801093441826,
"loss": 0.029,
"step": 600
},
{
"epoch": 2.28,
"eval_blimp_filtered_avg": 0.7173134328358209,
"eval_blimp_filtered_std": 0.005026688908914533,
"step": 600
},
{
"epoch": 2.28,
"eval_blimp_supplement_avg": 0.8254310344827587,
"eval_blimp_supplement_std": 0.017008878963692253,
"step": 600
},
{
"epoch": 2.28,
"eval_vqa_filtered_avg": 0.5,
"eval_vqa_filtered_std": 0.050251890762960605,
"step": 600
},
{
"epoch": 2.28,
"eval_winoground_filtered_avg": 0.65,
"eval_winoground_filtered_std": 0.04793724854411019,
"step": 600
},
{
"epoch": 2.29,
"grad_norm": 0.3284360674259941,
"learning_rate": 0.00011873813145857249,
"loss": 0.0307,
"step": 601
},
{
"epoch": 2.29,
"grad_norm": 0.4088320006622324,
"learning_rate": 0.0001184960398088094,
"loss": 0.0424,
"step": 602
},
{
"epoch": 2.29,
"grad_norm": 0.3347370845203778,
"learning_rate": 0.00011825383586469583,
"loss": 0.0301,
"step": 603
},
{
"epoch": 2.3,
"grad_norm": 0.34998604945319767,
"learning_rate": 0.00011801152109671595,
"loss": 0.0398,
"step": 604
},
{
"epoch": 2.3,
"grad_norm": 0.28072401871679603,
"learning_rate": 0.00011776909697602689,
"loss": 0.0207,
"step": 605
},
{
"epoch": 2.3,
"grad_norm": 0.27417539486849196,
"learning_rate": 0.00011752656497444952,
"loss": 0.0303,
"step": 606
},
{
"epoch": 2.31,
"grad_norm": 0.3528956989148332,
"learning_rate": 0.00011728392656445981,
"loss": 0.035,
"step": 607
},
{
"epoch": 2.31,
"grad_norm": 0.2732570327379083,
"learning_rate": 0.00011704118321917976,
"loss": 0.0293,
"step": 608
},
{
"epoch": 2.32,
"grad_norm": 0.23173712745705208,
"learning_rate": 0.00011679833641236844,
"loss": 0.0194,
"step": 609
},
{
"epoch": 2.32,
"grad_norm": 0.3833499067600359,
"learning_rate": 0.000116555387618413,
"loss": 0.0301,
"step": 610
},
{
"epoch": 2.32,
"grad_norm": 0.3445447080613637,
"learning_rate": 0.00011631233831231991,
"loss": 0.0394,
"step": 611
},
{
"epoch": 2.33,
"grad_norm": 0.3521386004172123,
"learning_rate": 0.00011606918996970573,
"loss": 0.0359,
"step": 612
},
{
"epoch": 2.33,
"grad_norm": 0.32961191495589676,
"learning_rate": 0.00011582594406678839,
"loss": 0.0353,
"step": 613
},
{
"epoch": 2.33,
"grad_norm": 0.32438381236368063,
"learning_rate": 0.00011558260208037817,
"loss": 0.0326,
"step": 614
},
{
"epoch": 2.34,
"grad_norm": 0.31086642828231076,
"learning_rate": 0.00011533916548786857,
"loss": 0.0319,
"step": 615
},
{
"epoch": 2.34,
"grad_norm": 0.3725691312752257,
"learning_rate": 0.00011509563576722753,
"loss": 0.0419,
"step": 616
},
{
"epoch": 2.35,
"grad_norm": 0.2712234441709734,
"learning_rate": 0.00011485201439698847,
"loss": 0.0286,
"step": 617
},
{
"epoch": 2.35,
"grad_norm": 0.2992546512802102,
"learning_rate": 0.00011460830285624118,
"loss": 0.0298,
"step": 618
},
{
"epoch": 2.35,
"grad_norm": 0.3740397615866031,
"learning_rate": 0.00011436450262462284,
"loss": 0.0434,
"step": 619
},
{
"epoch": 2.36,
"grad_norm": 0.34323442855123726,
"learning_rate": 0.00011412061518230914,
"loss": 0.038,
"step": 620
},
{
"epoch": 2.36,
"grad_norm": 0.2859959872205353,
"learning_rate": 0.00011387664201000532,
"loss": 0.0337,
"step": 621
},
{
"epoch": 2.37,
"grad_norm": 0.35126159863306816,
"learning_rate": 0.00011363258458893699,
"loss": 0.0418,
"step": 622
},
{
"epoch": 2.37,
"grad_norm": 0.36586330790757604,
"learning_rate": 0.00011338844440084138,
"loss": 0.0472,
"step": 623
},
{
"epoch": 2.37,
"grad_norm": 0.3515588484900087,
"learning_rate": 0.0001131442229279581,
"loss": 0.0343,
"step": 624
},
{
"epoch": 2.38,
"grad_norm": 0.26331852309035336,
"learning_rate": 0.00011289992165302035,
"loss": 0.0269,
"step": 625
},
{
"epoch": 2.38,
"grad_norm": 0.304600621064075,
"learning_rate": 0.00011265554205924575,
"loss": 0.0312,
"step": 626
},
{
"epoch": 2.38,
"grad_norm": 0.29191144254029494,
"learning_rate": 0.0001124110856303275,
"loss": 0.0351,
"step": 627
},
{
"epoch": 2.39,
"grad_norm": 0.39129462924046776,
"learning_rate": 0.00011216655385042525,
"loss": 0.035,
"step": 628
},
{
"epoch": 2.39,
"grad_norm": 0.38533876670236805,
"learning_rate": 0.00011192194820415609,
"loss": 0.0398,
"step": 629
},
{
"epoch": 2.4,
"grad_norm": 0.2395517368524462,
"learning_rate": 0.00011167727017658562,
"loss": 0.0305,
"step": 630
},
{
"epoch": 2.4,
"grad_norm": 0.2882756076173429,
"learning_rate": 0.00011143252125321892,
"loss": 0.028,
"step": 631
},
{
"epoch": 2.4,
"grad_norm": 0.3143279186770457,
"learning_rate": 0.00011118770291999137,
"loss": 0.0349,
"step": 632
},
{
"epoch": 2.41,
"grad_norm": 0.2851418078470389,
"learning_rate": 0.00011094281666325988,
"loss": 0.0274,
"step": 633
},
{
"epoch": 2.41,
"grad_norm": 0.36040663639869686,
"learning_rate": 0.00011069786396979367,
"loss": 0.0357,
"step": 634
},
{
"epoch": 2.41,
"grad_norm": 0.33153007014041297,
"learning_rate": 0.00011045284632676536,
"loss": 0.0339,
"step": 635
},
{
"epoch": 2.42,
"grad_norm": 0.28195191033726724,
"learning_rate": 0.00011020776522174186,
"loss": 0.0265,
"step": 636
},
{
"epoch": 2.42,
"grad_norm": 0.29390455843884594,
"learning_rate": 0.0001099626221426754,
"loss": 0.031,
"step": 637
},
{
"epoch": 2.43,
"grad_norm": 0.38900881293341505,
"learning_rate": 0.0001097174185778945,
"loss": 0.0277,
"step": 638
},
{
"epoch": 2.43,
"grad_norm": 0.3393893043389277,
"learning_rate": 0.00010947215601609479,
"loss": 0.0373,
"step": 639
},
{
"epoch": 2.43,
"grad_norm": 0.2908885670583566,
"learning_rate": 0.00010922683594633021,
"loss": 0.0275,
"step": 640
},
{
"epoch": 2.44,
"grad_norm": 0.32162535341655224,
"learning_rate": 0.00010898145985800381,
"loss": 0.0296,
"step": 641
},
{
"epoch": 2.44,
"grad_norm": 0.3101819804262916,
"learning_rate": 0.00010873602924085869,
"loss": 0.0279,
"step": 642
},
{
"epoch": 2.44,
"grad_norm": 0.3225846056983095,
"learning_rate": 0.00010849054558496905,
"loss": 0.0359,
"step": 643
},
{
"epoch": 2.45,
"grad_norm": 0.3320709262623266,
"learning_rate": 0.00010824501038073116,
"loss": 0.0272,
"step": 644
},
{
"epoch": 2.45,
"grad_norm": 0.34267032779621814,
"learning_rate": 0.00010799942511885418,
"loss": 0.0349,
"step": 645
},
{
"epoch": 2.46,
"grad_norm": 0.3252434307442351,
"learning_rate": 0.00010775379129035116,
"loss": 0.0353,
"step": 646
},
{
"epoch": 2.46,
"grad_norm": 0.3424342161187399,
"learning_rate": 0.00010750811038653008,
"loss": 0.0395,
"step": 647
},
{
"epoch": 2.46,
"grad_norm": 0.3350229460047768,
"learning_rate": 0.00010726238389898471,
"loss": 0.0311,
"step": 648
},
{
"epoch": 2.47,
"grad_norm": 0.3333212059527241,
"learning_rate": 0.00010701661331958553,
"loss": 0.0305,
"step": 649
},
{
"epoch": 2.47,
"grad_norm": 0.342236697340712,
"learning_rate": 0.00010677080014047076,
"loss": 0.0387,
"step": 650
},
{
"epoch": 2.48,
"grad_norm": 0.31813476077439445,
"learning_rate": 0.00010652494585403725,
"loss": 0.0332,
"step": 651
},
{
"epoch": 2.48,
"grad_norm": 0.34237738807094575,
"learning_rate": 0.00010627905195293135,
"loss": 0.0305,
"step": 652
},
{
"epoch": 2.48,
"grad_norm": 0.40607599891255564,
"learning_rate": 0.00010603311993004004,
"loss": 0.0394,
"step": 653
},
{
"epoch": 2.49,
"grad_norm": 0.29414538244181715,
"learning_rate": 0.00010578715127848167,
"loss": 0.03,
"step": 654
},
{
"epoch": 2.49,
"grad_norm": 0.37329000324002554,
"learning_rate": 0.000105541147491597,
"loss": 0.0421,
"step": 655
},
{
"epoch": 2.49,
"grad_norm": 0.3522250044405319,
"learning_rate": 0.00010529511006294009,
"loss": 0.045,
"step": 656
},
{
"epoch": 2.5,
"grad_norm": 0.28121115420821313,
"learning_rate": 0.00010504904048626925,
"loss": 0.0292,
"step": 657
},
{
"epoch": 2.5,
"grad_norm": 0.26325089300460475,
"learning_rate": 0.00010480294025553798,
"loss": 0.0292,
"step": 658
},
{
"epoch": 2.51,
"grad_norm": 0.2719725064408085,
"learning_rate": 0.00010455681086488586,
"loss": 0.0231,
"step": 659
},
{
"epoch": 2.51,
"grad_norm": 0.3453111024050063,
"learning_rate": 0.00010431065380862959,
"loss": 0.0378,
"step": 660
},
{
"epoch": 2.51,
"grad_norm": 0.27775866655685405,
"learning_rate": 0.00010406447058125368,
"loss": 0.0286,
"step": 661
},
{
"epoch": 2.52,
"grad_norm": 0.33965483172047317,
"learning_rate": 0.00010381826267740171,
"loss": 0.0333,
"step": 662
},
{
"epoch": 2.52,
"grad_norm": 0.27735302965276887,
"learning_rate": 0.00010357203159186694,
"loss": 0.0271,
"step": 663
},
{
"epoch": 2.52,
"grad_norm": 0.3359647684514768,
"learning_rate": 0.0001033257788195835,
"loss": 0.0399,
"step": 664
},
{
"epoch": 2.53,
"grad_norm": 0.31390916758505816,
"learning_rate": 0.00010307950585561706,
"loss": 0.0259,
"step": 665
},
{
"epoch": 2.53,
"grad_norm": 0.26289076188241683,
"learning_rate": 0.0001028332141951559,
"loss": 0.0303,
"step": 666
},
{
"epoch": 2.54,
"grad_norm": 0.4074393150808,
"learning_rate": 0.0001025869053335019,
"loss": 0.0339,
"step": 667
},
{
"epoch": 2.54,
"grad_norm": 0.254687354277676,
"learning_rate": 0.0001023405807660613,
"loss": 0.0263,
"step": 668
},
{
"epoch": 2.54,
"grad_norm": 0.31504128834532047,
"learning_rate": 0.0001020942419883357,
"loss": 0.0308,
"step": 669
},
{
"epoch": 2.55,
"grad_norm": 0.3328845285750464,
"learning_rate": 0.00010184789049591299,
"loss": 0.0355,
"step": 670
},
{
"epoch": 2.55,
"grad_norm": 0.31546120267640665,
"learning_rate": 0.00010160152778445829,
"loss": 0.0352,
"step": 671
},
{
"epoch": 2.56,
"grad_norm": 0.2983572052164693,
"learning_rate": 0.0001013551553497047,
"loss": 0.0293,
"step": 672
},
{
"epoch": 2.56,
"grad_norm": 0.31011068039080025,
"learning_rate": 0.0001011087746874445,
"loss": 0.0354,
"step": 673
},
{
"epoch": 2.56,
"grad_norm": 0.2987245112804117,
"learning_rate": 0.00010086238729351988,
"loss": 0.0354,
"step": 674
},
{
"epoch": 2.57,
"grad_norm": 0.3234038283715016,
"learning_rate": 0.00010061599466381389,
"loss": 0.0287,
"step": 675
},
{
"epoch": 2.57,
"grad_norm": 0.27880614367112977,
"learning_rate": 0.00010036959829424131,
"loss": 0.0249,
"step": 676
},
{
"epoch": 2.57,
"grad_norm": 0.3435335426279227,
"learning_rate": 0.0001001231996807397,
"loss": 0.0363,
"step": 677
},
{
"epoch": 2.58,
"grad_norm": 0.3166036055942454,
"learning_rate": 9.987680031926032e-05,
"loss": 0.0302,
"step": 678
},
{
"epoch": 2.58,
"grad_norm": 0.2894910254443763,
"learning_rate": 9.96304017057587e-05,
"loss": 0.0285,
"step": 679
},
{
"epoch": 2.59,
"grad_norm": 0.34201749143530447,
"learning_rate": 9.938400533618615e-05,
"loss": 0.0443,
"step": 680
},
{
"epoch": 2.59,
"grad_norm": 0.3740842570087233,
"learning_rate": 9.913761270648015e-05,
"loss": 0.0303,
"step": 681
},
{
"epoch": 2.59,
"grad_norm": 0.3215036425488617,
"learning_rate": 9.889122531255552e-05,
"loss": 0.0383,
"step": 682
},
{
"epoch": 2.6,
"grad_norm": 0.29139970169902235,
"learning_rate": 9.864484465029536e-05,
"loss": 0.0266,
"step": 683
},
{
"epoch": 2.6,
"grad_norm": 0.26997285345644567,
"learning_rate": 9.839847221554175e-05,
"loss": 0.0271,
"step": 684
},
{
"epoch": 2.6,
"grad_norm": 0.37692961834447125,
"learning_rate": 9.815210950408704e-05,
"loss": 0.0409,
"step": 685
},
{
"epoch": 2.61,
"grad_norm": 0.34286292595950857,
"learning_rate": 9.790575801166432e-05,
"loss": 0.0373,
"step": 686
},
{
"epoch": 2.61,
"grad_norm": 0.3255553751579432,
"learning_rate": 9.765941923393874e-05,
"loss": 0.0272,
"step": 687
},
{
"epoch": 2.62,
"grad_norm": 0.3158220004892916,
"learning_rate": 9.741309466649813e-05,
"loss": 0.0269,
"step": 688
},
{
"epoch": 2.62,
"grad_norm": 0.2736162407266226,
"learning_rate": 9.716678580484411e-05,
"loss": 0.0247,
"step": 689
},
{
"epoch": 2.62,
"grad_norm": 0.337905173595386,
"learning_rate": 9.692049414438299e-05,
"loss": 0.0378,
"step": 690
},
{
"epoch": 2.63,
"grad_norm": 0.3258813431481511,
"learning_rate": 9.667422118041651e-05,
"loss": 0.0276,
"step": 691
},
{
"epoch": 2.63,
"grad_norm": 0.31627150266086607,
"learning_rate": 9.642796840813308e-05,
"loss": 0.0246,
"step": 692
},
{
"epoch": 2.63,
"grad_norm": 0.32285673653685554,
"learning_rate": 9.61817373225983e-05,
"loss": 0.0256,
"step": 693
},
{
"epoch": 2.64,
"grad_norm": 0.3021398612416659,
"learning_rate": 9.593552941874635e-05,
"loss": 0.0256,
"step": 694
},
{
"epoch": 2.64,
"grad_norm": 0.2771885920682674,
"learning_rate": 9.568934619137046e-05,
"loss": 0.0236,
"step": 695
},
{
"epoch": 2.65,
"grad_norm": 0.2913089871269692,
"learning_rate": 9.544318913511416e-05,
"loss": 0.0276,
"step": 696
},
{
"epoch": 2.65,
"grad_norm": 0.29943649863489186,
"learning_rate": 9.519705974446207e-05,
"loss": 0.0281,
"step": 697
},
{
"epoch": 2.65,
"grad_norm": 0.28658510456203773,
"learning_rate": 9.495095951373076e-05,
"loss": 0.0222,
"step": 698
},
{
"epoch": 2.66,
"grad_norm": 0.32216521722606845,
"learning_rate": 9.470488993705992e-05,
"loss": 0.0275,
"step": 699
},
{
"epoch": 2.66,
"grad_norm": 0.3534602351211303,
"learning_rate": 9.4458852508403e-05,
"loss": 0.0349,
"step": 700
},
{
"epoch": 2.66,
"eval_blimp_filtered_avg": 0.7164179104477612,
"eval_blimp_filtered_std": 0.005041707326458033,
"step": 700
},
{
"epoch": 2.66,
"eval_blimp_supplement_avg": 0.8297413793103449,
"eval_blimp_supplement_std": 0.016828487437818656,
"step": 700
},
{
"epoch": 2.66,
"eval_vqa_filtered_avg": 0.5,
"eval_vqa_filtered_std": 0.050251890762960605,
"step": 700
},
{
"epoch": 2.66,
"eval_winoground_filtered_avg": 0.66,
"eval_winoground_filtered_std": 0.04760952285695238,
"step": 700
},
{
"epoch": 2.67,
"grad_norm": 0.38483480730152236,
"learning_rate": 9.421284872151836e-05,
"loss": 0.0447,
"step": 701
},
{
"epoch": 2.67,
"grad_norm": 0.2691349907593638,
"learning_rate": 9.396688006996e-05,
"loss": 0.0275,
"step": 702
},
{
"epoch": 2.67,
"grad_norm": 0.30394270832959963,
"learning_rate": 9.372094804706867e-05,
"loss": 0.0245,
"step": 703
},
{
"epoch": 2.68,
"grad_norm": 0.3894950215679374,
"learning_rate": 9.34750541459628e-05,
"loss": 0.0328,
"step": 704
},
{
"epoch": 2.68,
"grad_norm": 0.3066861317584869,
"learning_rate": 9.322919985952926e-05,
"loss": 0.0257,
"step": 705
},
{
"epoch": 2.68,
"grad_norm": 0.2839567453054656,
"learning_rate": 9.298338668041451e-05,
"loss": 0.0301,
"step": 706
},
{
"epoch": 2.69,
"grad_norm": 0.3037116435680821,
"learning_rate": 9.27376161010153e-05,
"loss": 0.0221,
"step": 707
},
{
"epoch": 2.69,
"grad_norm": 0.3134542478986076,
"learning_rate": 9.249188961346993e-05,
"loss": 0.0338,
"step": 708
},
{
"epoch": 2.7,
"grad_norm": 0.3541455631807929,
"learning_rate": 9.224620870964886e-05,
"loss": 0.0355,
"step": 709
},
{
"epoch": 2.7,
"grad_norm": 0.3115683125480853,
"learning_rate": 9.200057488114585e-05,
"loss": 0.0252,
"step": 710
},
{
"epoch": 2.7,
"grad_norm": 0.27292900060486747,
"learning_rate": 9.175498961926886e-05,
"loss": 0.021,
"step": 711
},
{
"epoch": 2.71,
"grad_norm": 0.28750762243690786,
"learning_rate": 9.150945441503093e-05,
"loss": 0.0262,
"step": 712
},
{
"epoch": 2.71,
"grad_norm": 0.3131812214087709,
"learning_rate": 9.126397075914135e-05,
"loss": 0.032,
"step": 713
},
{
"epoch": 2.71,
"grad_norm": 0.2503186984511136,
"learning_rate": 9.101854014199622e-05,
"loss": 0.0222,
"step": 714
},
{
"epoch": 2.72,
"grad_norm": 0.26585317166475225,
"learning_rate": 9.077316405366981e-05,
"loss": 0.0237,
"step": 715
},
{
"epoch": 2.72,
"grad_norm": 0.30126199934192793,
"learning_rate": 9.052784398390525e-05,
"loss": 0.0294,
"step": 716
},
{
"epoch": 2.73,
"grad_norm": 0.2903130573653526,
"learning_rate": 9.028258142210552e-05,
"loss": 0.0294,
"step": 717
},
{
"epoch": 2.73,
"grad_norm": 0.27174816407170116,
"learning_rate": 9.00373778573246e-05,
"loss": 0.0269,
"step": 718
},
{
"epoch": 2.73,
"grad_norm": 0.3629515708975863,
"learning_rate": 8.979223477825814e-05,
"loss": 0.0281,
"step": 719
},
{
"epoch": 2.74,
"grad_norm": 0.3714998200084635,
"learning_rate": 8.954715367323468e-05,
"loss": 0.0331,
"step": 720
},
{
"epoch": 2.74,
"grad_norm": 0.3387130445188704,
"learning_rate": 8.930213603020638e-05,
"loss": 0.0349,
"step": 721
},
{
"epoch": 2.75,
"grad_norm": 0.33356330455180916,
"learning_rate": 8.905718333674013e-05,
"loss": 0.0323,
"step": 722
},
{
"epoch": 2.75,
"grad_norm": 0.3443287065763936,
"learning_rate": 8.881229708000865e-05,
"loss": 0.0314,
"step": 723
},
{
"epoch": 2.75,
"grad_norm": 0.36589381487249345,
"learning_rate": 8.85674787467811e-05,
"loss": 0.0331,
"step": 724
},
{
"epoch": 2.76,
"grad_norm": 0.33686969776224657,
"learning_rate": 8.832272982341439e-05,
"loss": 0.0347,
"step": 725
},
{
"epoch": 2.76,
"grad_norm": 0.2821892144697015,
"learning_rate": 8.80780517958439e-05,
"loss": 0.029,
"step": 726
},
{
"epoch": 2.76,
"grad_norm": 0.3370413152254308,
"learning_rate": 8.783344614957477e-05,
"loss": 0.0309,
"step": 727
},
{
"epoch": 2.77,
"grad_norm": 0.27254650378131945,
"learning_rate": 8.758891436967252e-05,
"loss": 0.0206,
"step": 728
},
{
"epoch": 2.77,
"grad_norm": 0.344842568070324,
"learning_rate": 8.734445794075428e-05,
"loss": 0.0331,
"step": 729
},
{
"epoch": 2.78,
"grad_norm": 0.3263812096586238,
"learning_rate": 8.710007834697969e-05,
"loss": 0.0323,
"step": 730
},
{
"epoch": 2.78,
"grad_norm": 0.24908057089857247,
"learning_rate": 8.68557770720419e-05,
"loss": 0.024,
"step": 731
},
{
"epoch": 2.78,
"grad_norm": 0.2829456218767608,
"learning_rate": 8.661155559915863e-05,
"loss": 0.0258,
"step": 732
},
{
"epoch": 2.79,
"grad_norm": 0.2992177526407328,
"learning_rate": 8.636741541106299e-05,
"loss": 0.0258,
"step": 733
},
{
"epoch": 2.79,
"grad_norm": 0.3498647491010859,
"learning_rate": 8.61233579899947e-05,
"loss": 0.0309,
"step": 734
},
{
"epoch": 2.79,
"grad_norm": 0.31399097802426235,
"learning_rate": 8.587938481769089e-05,
"loss": 0.0391,
"step": 735
},
{
"epoch": 2.8,
"grad_norm": 0.28430578853728555,
"learning_rate": 8.563549737537719e-05,
"loss": 0.026,
"step": 736
},
{
"epoch": 2.8,
"grad_norm": 0.2523093762154707,
"learning_rate": 8.539169714375885e-05,
"loss": 0.0212,
"step": 737
},
{
"epoch": 2.81,
"grad_norm": 0.29858190065300216,
"learning_rate": 8.514798560301152e-05,
"loss": 0.0339,
"step": 738
},
{
"epoch": 2.81,
"grad_norm": 0.2536866212590354,
"learning_rate": 8.490436423277248e-05,
"loss": 0.0254,
"step": 739
},
{
"epoch": 2.81,
"grad_norm": 0.2924064432366086,
"learning_rate": 8.466083451213144e-05,
"loss": 0.0293,
"step": 740
},
{
"epoch": 2.82,
"grad_norm": 0.2981743418161694,
"learning_rate": 8.441739791962187e-05,
"loss": 0.0258,
"step": 741
},
{
"epoch": 2.82,
"grad_norm": 0.2812452863028169,
"learning_rate": 8.417405593321163e-05,
"loss": 0.0258,
"step": 742
},
{
"epoch": 2.83,
"grad_norm": 0.28479487575175905,
"learning_rate": 8.393081003029431e-05,
"loss": 0.0256,
"step": 743
},
{
"epoch": 2.83,
"grad_norm": 0.34570399590977596,
"learning_rate": 8.368766168768014e-05,
"loss": 0.0325,
"step": 744
},
{
"epoch": 2.83,
"grad_norm": 0.3337375710597649,
"learning_rate": 8.344461238158699e-05,
"loss": 0.0327,
"step": 745
},
{
"epoch": 2.84,
"grad_norm": 0.2986872900474696,
"learning_rate": 8.320166358763159e-05,
"loss": 0.0217,
"step": 746
},
{
"epoch": 2.84,
"grad_norm": 0.28712435596387936,
"learning_rate": 8.295881678082024e-05,
"loss": 0.0256,
"step": 747
},
{
"epoch": 2.84,
"grad_norm": 0.3272426704832391,
"learning_rate": 8.271607343554021e-05,
"loss": 0.0246,
"step": 748
},
{
"epoch": 2.85,
"grad_norm": 0.3092816463373921,
"learning_rate": 8.247343502555053e-05,
"loss": 0.0304,
"step": 749
},
{
"epoch": 2.85,
"grad_norm": 0.3000320928728694,
"learning_rate": 8.223090302397313e-05,
"loss": 0.0246,
"step": 750
},
{
"epoch": 2.86,
"grad_norm": 0.2990018271701567,
"learning_rate": 8.198847890328406e-05,
"loss": 0.0301,
"step": 751
},
{
"epoch": 2.86,
"grad_norm": 0.3674125202368627,
"learning_rate": 8.174616413530418e-05,
"loss": 0.0294,
"step": 752
},
{
"epoch": 2.86,
"grad_norm": 0.32604533269795055,
"learning_rate": 8.150396019119062e-05,
"loss": 0.0345,
"step": 753
},
{
"epoch": 2.87,
"grad_norm": 0.283530884706442,
"learning_rate": 8.126186854142752e-05,
"loss": 0.0193,
"step": 754
},
{
"epoch": 2.87,
"grad_norm": 0.25164065278234365,
"learning_rate": 8.101989065581743e-05,
"loss": 0.0184,
"step": 755
},
{
"epoch": 2.87,
"grad_norm": 0.25607430685124377,
"learning_rate": 8.077802800347205e-05,
"loss": 0.0199,
"step": 756
},
{
"epoch": 2.88,
"grad_norm": 0.2934139007796417,
"learning_rate": 8.053628205280347e-05,
"loss": 0.0234,
"step": 757
},
{
"epoch": 2.88,
"grad_norm": 0.30060563019331127,
"learning_rate": 8.029465427151538e-05,
"loss": 0.0243,
"step": 758
},
{
"epoch": 2.89,
"grad_norm": 0.31584329920715304,
"learning_rate": 8.005314612659393e-05,
"loss": 0.0233,
"step": 759
},
{
"epoch": 2.89,
"grad_norm": 0.3399830490651399,
"learning_rate": 7.9811759084299e-05,
"loss": 0.0303,
"step": 760
},
{
"epoch": 2.89,
"grad_norm": 0.23710075133737013,
"learning_rate": 7.957049461015512e-05,
"loss": 0.0201,
"step": 761
},
{
"epoch": 2.9,
"grad_norm": 0.291414714994268,
"learning_rate": 7.932935416894272e-05,
"loss": 0.0261,
"step": 762
},
{
"epoch": 2.9,
"grad_norm": 0.25516874264704126,
"learning_rate": 7.908833922468927e-05,
"loss": 0.0209,
"step": 763
},
{
"epoch": 2.9,
"grad_norm": 0.3225236014785536,
"learning_rate": 7.884745124066023e-05,
"loss": 0.0293,
"step": 764
},
{
"epoch": 2.91,
"grad_norm": 0.33073323071844557,
"learning_rate": 7.860669167935028e-05,
"loss": 0.0312,
"step": 765
},
{
"epoch": 2.91,
"grad_norm": 0.2747460833033659,
"learning_rate": 7.836606200247436e-05,
"loss": 0.0249,
"step": 766
},
{
"epoch": 2.92,
"grad_norm": 0.22799207899871587,
"learning_rate": 7.812556367095896e-05,
"loss": 0.0236,
"step": 767
},
{
"epoch": 2.92,
"grad_norm": 0.2957004620104486,
"learning_rate": 7.788519814493304e-05,
"loss": 0.0264,
"step": 768
},
{
"epoch": 2.92,
"grad_norm": 0.28521269666590277,
"learning_rate": 7.764496688371929e-05,
"loss": 0.0279,
"step": 769
},
{
"epoch": 2.93,
"grad_norm": 0.24287300349001784,
"learning_rate": 7.740487134582525e-05,
"loss": 0.0182,
"step": 770
},
{
"epoch": 2.93,
"grad_norm": 0.3175933387720721,
"learning_rate": 7.716491298893442e-05,
"loss": 0.0339,
"step": 771
},
{
"epoch": 2.94,
"grad_norm": 0.32835585850076665,
"learning_rate": 7.692509326989753e-05,
"loss": 0.0264,
"step": 772
},
{
"epoch": 2.94,
"grad_norm": 0.3273636208785487,
"learning_rate": 7.668541364472346e-05,
"loss": 0.0371,
"step": 773
},
{
"epoch": 2.94,
"grad_norm": 0.32029214184531163,
"learning_rate": 7.644587556857065e-05,
"loss": 0.0266,
"step": 774
},
{
"epoch": 2.95,
"grad_norm": 0.3573218111505275,
"learning_rate": 7.620648049573815e-05,
"loss": 0.0294,
"step": 775
},
{
"epoch": 2.95,
"grad_norm": 0.2639798311874394,
"learning_rate": 7.596722987965669e-05,
"loss": 0.0245,
"step": 776
},
{
"epoch": 2.95,
"grad_norm": 0.3198924686801555,
"learning_rate": 7.572812517288018e-05,
"loss": 0.0222,
"step": 777
},
{
"epoch": 2.96,
"grad_norm": 0.3153205405405442,
"learning_rate": 7.548916782707642e-05,
"loss": 0.0322,
"step": 778
},
{
"epoch": 2.96,
"grad_norm": 0.26231028932152634,
"learning_rate": 7.525035929301877e-05,
"loss": 0.0302,
"step": 779
},
{
"epoch": 2.97,
"grad_norm": 0.2688781021142035,
"learning_rate": 7.50117010205769e-05,
"loss": 0.0251,
"step": 780
},
{
"epoch": 2.97,
"grad_norm": 0.26830771551596,
"learning_rate": 7.477319445870845e-05,
"loss": 0.0226,
"step": 781
},
{
"epoch": 2.97,
"grad_norm": 0.30176145649756575,
"learning_rate": 7.453484105544976e-05,
"loss": 0.0331,
"step": 782
},
{
"epoch": 2.98,
"grad_norm": 0.3115716482147434,
"learning_rate": 7.429664225790743e-05,
"loss": 0.0233,
"step": 783
},
{
"epoch": 2.98,
"grad_norm": 0.31100418685578723,
"learning_rate": 7.405859951224933e-05,
"loss": 0.0284,
"step": 784
},
{
"epoch": 2.98,
"grad_norm": 0.2588637124537901,
"learning_rate": 7.382071426369597e-05,
"loss": 0.0233,
"step": 785
},
{
"epoch": 2.99,
"grad_norm": 0.31963780634052935,
"learning_rate": 7.358298795651165e-05,
"loss": 0.0201,
"step": 786
},
{
"epoch": 2.99,
"grad_norm": 0.3357482742804347,
"learning_rate": 7.33454220339956e-05,
"loss": 0.0234,
"step": 787
},
{
"epoch": 3.0,
"grad_norm": 0.31241871424393247,
"learning_rate": 7.310801793847344e-05,
"loss": 0.0352,
"step": 788
},
{
"epoch": 3.0,
"grad_norm": 0.34133470506967756,
"learning_rate": 7.287077711128823e-05,
"loss": 0.0237,
"step": 789
},
{
"epoch": 3.0,
"grad_norm": 0.14462364414170686,
"learning_rate": 7.263370099279172e-05,
"loss": 0.0101,
"step": 790
},
{
"epoch": 3.01,
"grad_norm": 0.139685520904427,
"learning_rate": 7.239679102233582e-05,
"loss": 0.008,
"step": 791
},
{
"epoch": 3.01,
"grad_norm": 0.15075598570023135,
"learning_rate": 7.21600486382636e-05,
"loss": 0.009,
"step": 792
},
{
"epoch": 3.02,
"grad_norm": 0.09798572261973783,
"learning_rate": 7.192347527790073e-05,
"loss": 0.0047,
"step": 793
},
{
"epoch": 3.02,
"grad_norm": 0.17274723978571413,
"learning_rate": 7.168707237754658e-05,
"loss": 0.0101,
"step": 794
},
{
"epoch": 3.02,
"grad_norm": 0.14385963722120923,
"learning_rate": 7.14508413724658e-05,
"loss": 0.0087,
"step": 795
},
{
"epoch": 3.03,
"grad_norm": 0.1417300685940669,
"learning_rate": 7.121478369687926e-05,
"loss": 0.005,
"step": 796
},
{
"epoch": 3.03,
"grad_norm": 0.13023296035033566,
"learning_rate": 7.097890078395553e-05,
"loss": 0.0062,
"step": 797
},
{
"epoch": 3.03,
"grad_norm": 0.15199661343358178,
"learning_rate": 7.074319406580224e-05,
"loss": 0.0077,
"step": 798
},
{
"epoch": 3.04,
"grad_norm": 0.26597073732306437,
"learning_rate": 7.050766497345714e-05,
"loss": 0.0203,
"step": 799
},
{
"epoch": 3.04,
"grad_norm": 0.16538027691964055,
"learning_rate": 7.027231493687974e-05,
"loss": 0.0111,
"step": 800
},
{
"epoch": 3.04,
"eval_blimp_filtered_avg": 0.7185074626865672,
"eval_blimp_filtered_std": 0.005025927855759427,
"step": 800
},
{
"epoch": 3.04,
"eval_blimp_supplement_avg": 0.8297413793103449,
"eval_blimp_supplement_std": 0.016768829882349248,
"step": 800
},
{
"epoch": 3.04,
"eval_vqa_filtered_avg": 0.49,
"eval_vqa_filtered_std": 0.05024183937956912,
"step": 800
},
{
"epoch": 3.04,
"eval_winoground_filtered_avg": 0.67,
"eval_winoground_filtered_std": 0.04725815626252606,
"step": 800
},
{
"epoch": 3.05,
"grad_norm": 0.21979041950155864,
"learning_rate": 7.003714538494233e-05,
"loss": 0.0087,
"step": 801
},
{
"epoch": 3.05,
"grad_norm": 0.18913233160499004,
"learning_rate": 6.980215774542147e-05,
"loss": 0.0096,
"step": 802
},
{
"epoch": 3.05,
"grad_norm": 0.1282988164073535,
"learning_rate": 6.95673534449893e-05,
"loss": 0.0068,
"step": 803
},
{
"epoch": 3.06,
"grad_norm": 0.21752972603923165,
"learning_rate": 6.933273390920478e-05,
"loss": 0.0104,
"step": 804
},
{
"epoch": 3.06,
"grad_norm": 0.1967481497472081,
"learning_rate": 6.909830056250527e-05,
"loss": 0.01,
"step": 805
},
{
"epoch": 3.06,
"grad_norm": 0.18179145828619866,
"learning_rate": 6.886405482819756e-05,
"loss": 0.0093,
"step": 806
},
{
"epoch": 3.07,
"grad_norm": 0.17276847874184295,
"learning_rate": 6.862999812844953e-05,
"loss": 0.0092,
"step": 807
},
{
"epoch": 3.07,
"grad_norm": 0.1342024775407428,
"learning_rate": 6.839613188428126e-05,
"loss": 0.0065,
"step": 808
},
{
"epoch": 3.08,
"grad_norm": 0.14878102990268274,
"learning_rate": 6.81624575155566e-05,
"loss": 0.0063,
"step": 809
},
{
"epoch": 3.08,
"grad_norm": 0.16403017563876973,
"learning_rate": 6.792897644097451e-05,
"loss": 0.0087,
"step": 810
},
{
"epoch": 3.08,
"grad_norm": 0.19801520787974072,
"learning_rate": 6.769569007806027e-05,
"loss": 0.0093,
"step": 811
},
{
"epoch": 3.09,
"grad_norm": 0.20572707361042517,
"learning_rate": 6.746259984315717e-05,
"loss": 0.011,
"step": 812
},
{
"epoch": 3.09,
"grad_norm": 0.13697981744457283,
"learning_rate": 6.722970715141763e-05,
"loss": 0.0046,
"step": 813
},
{
"epoch": 3.1,
"grad_norm": 0.19205653189851232,
"learning_rate": 6.699701341679488e-05,
"loss": 0.0092,
"step": 814
},
{
"epoch": 3.1,
"grad_norm": 0.1918525797269716,
"learning_rate": 6.676452005203406e-05,
"loss": 0.0098,
"step": 815
},
{
"epoch": 3.1,
"grad_norm": 0.2248320215810751,
"learning_rate": 6.653222846866389e-05,
"loss": 0.0109,
"step": 816
},
{
"epoch": 3.11,
"grad_norm": 0.15114455523839437,
"learning_rate": 6.630014007698807e-05,
"loss": 0.0067,
"step": 817
},
{
"epoch": 3.11,
"grad_norm": 0.13233475079909107,
"learning_rate": 6.606825628607654e-05,
"loss": 0.0071,
"step": 818
},
{
"epoch": 3.11,
"grad_norm": 0.13146130811845824,
"learning_rate": 6.583657850375723e-05,
"loss": 0.0059,
"step": 819
},
{
"epoch": 3.12,
"grad_norm": 0.1857326870966165,
"learning_rate": 6.560510813660719e-05,
"loss": 0.0036,
"step": 820
},
{
"epoch": 3.12,
"grad_norm": 0.2298298643113706,
"learning_rate": 6.537384658994428e-05,
"loss": 0.0086,
"step": 821
},
{
"epoch": 3.13,
"grad_norm": 0.14279703425417056,
"learning_rate": 6.51427952678185e-05,
"loss": 0.0085,
"step": 822
},
{
"epoch": 3.13,
"grad_norm": 0.10832021907148229,
"learning_rate": 6.491195557300353e-05,
"loss": 0.0058,
"step": 823
},
{
"epoch": 3.13,
"grad_norm": 0.11823319091468878,
"learning_rate": 6.468132890698829e-05,
"loss": 0.0062,
"step": 824
},
{
"epoch": 3.14,
"grad_norm": 0.18897801838669076,
"learning_rate": 6.44509166699682e-05,
"loss": 0.0092,
"step": 825
},
{
"epoch": 3.14,
"grad_norm": 0.25020039121328774,
"learning_rate": 6.422072026083697e-05,
"loss": 0.0137,
"step": 826
},
{
"epoch": 3.14,
"grad_norm": 0.20775807254077025,
"learning_rate": 6.399074107717782e-05,
"loss": 0.0093,
"step": 827
},
{
"epoch": 3.15,
"grad_norm": 0.20035420751641417,
"learning_rate": 6.376098051525529e-05,
"loss": 0.008,
"step": 828
},
{
"epoch": 3.15,
"grad_norm": 0.1813508652414028,
"learning_rate": 6.35314399700065e-05,
"loss": 0.0046,
"step": 829
},
{
"epoch": 3.16,
"grad_norm": 0.17007027062456834,
"learning_rate": 6.33021208350328e-05,
"loss": 0.0073,
"step": 830
},
{
"epoch": 3.16,
"grad_norm": 0.19015476370218232,
"learning_rate": 6.307302450259136e-05,
"loss": 0.0106,
"step": 831
},
{
"epoch": 3.16,
"grad_norm": 0.16589400476299693,
"learning_rate": 6.284415236358653e-05,
"loss": 0.0055,
"step": 832
},
{
"epoch": 3.17,
"grad_norm": 0.16209318707015577,
"learning_rate": 6.261550580756175e-05,
"loss": 0.005,
"step": 833
},
{
"epoch": 3.17,
"grad_norm": 0.18839393523384632,
"learning_rate": 6.238708622269065e-05,
"loss": 0.0089,
"step": 834
},
{
"epoch": 3.17,
"grad_norm": 0.16926357657366423,
"learning_rate": 6.215889499576898e-05,
"loss": 0.0071,
"step": 835
},
{
"epoch": 3.18,
"grad_norm": 0.1427288948192287,
"learning_rate": 6.193093351220605e-05,
"loss": 0.0071,
"step": 836
},
{
"epoch": 3.18,
"grad_norm": 0.0823573599543454,
"learning_rate": 6.170320315601628e-05,
"loss": 0.0028,
"step": 837
},
{
"epoch": 3.19,
"grad_norm": 0.17793420603102786,
"learning_rate": 6.147570530981099e-05,
"loss": 0.0073,
"step": 838
},
{
"epoch": 3.19,
"grad_norm": 0.2613656409512038,
"learning_rate": 6.12484413547897e-05,
"loss": 0.0099,
"step": 839
},
{
"epoch": 3.19,
"grad_norm": 0.10748497227166323,
"learning_rate": 6.102141267073207e-05,
"loss": 0.0056,
"step": 840
},
{
"epoch": 3.2,
"grad_norm": 0.3115342005600293,
"learning_rate": 6.0794620635989244e-05,
"loss": 0.0202,
"step": 841
},
{
"epoch": 3.2,
"grad_norm": 0.17604863819079872,
"learning_rate": 6.056806662747572e-05,
"loss": 0.0058,
"step": 842
},
{
"epoch": 3.21,
"grad_norm": 0.09017231755285983,
"learning_rate": 6.034175202066077e-05,
"loss": 0.0031,
"step": 843
},
{
"epoch": 3.21,
"grad_norm": 0.15807640643888599,
"learning_rate": 6.011567818956021e-05,
"loss": 0.0083,
"step": 844
},
{
"epoch": 3.21,
"grad_norm": 0.1322797473736452,
"learning_rate": 5.988984650672813e-05,
"loss": 0.0054,
"step": 845
},
{
"epoch": 3.22,
"grad_norm": 0.1646419558157819,
"learning_rate": 5.96642583432484e-05,
"loss": 0.0088,
"step": 846
},
{
"epoch": 3.22,
"grad_norm": 0.15547257435043996,
"learning_rate": 5.943891506872645e-05,
"loss": 0.0074,
"step": 847
},
{
"epoch": 3.22,
"grad_norm": 0.19268286944193538,
"learning_rate": 5.921381805128088e-05,
"loss": 0.0125,
"step": 848
},
{
"epoch": 3.23,
"grad_norm": 0.16001416685280997,
"learning_rate": 5.898896865753522e-05,
"loss": 0.0102,
"step": 849
},
{
"epoch": 3.23,
"grad_norm": 0.14560946392731133,
"learning_rate": 5.876436825260967e-05,
"loss": 0.0058,
"step": 850
},
{
"epoch": 3.24,
"grad_norm": 0.10597553502655095,
"learning_rate": 5.854001820011265e-05,
"loss": 0.0042,
"step": 851
},
{
"epoch": 3.24,
"grad_norm": 0.1477597061504793,
"learning_rate": 5.831591986213274e-05,
"loss": 0.0115,
"step": 852
},
{
"epoch": 3.24,
"grad_norm": 0.11589346222710697,
"learning_rate": 5.809207459923016e-05,
"loss": 0.0052,
"step": 853
},
{
"epoch": 3.25,
"grad_norm": 0.11233318637657551,
"learning_rate": 5.786848377042875e-05,
"loss": 0.0047,
"step": 854
},
{
"epoch": 3.25,
"grad_norm": 0.14038827344355653,
"learning_rate": 5.764514873320761e-05,
"loss": 0.0056,
"step": 855
},
{
"epoch": 3.25,
"grad_norm": 0.13670003396305133,
"learning_rate": 5.7422070843492734e-05,
"loss": 0.007,
"step": 856
},
{
"epoch": 3.26,
"grad_norm": 0.10138719221994133,
"learning_rate": 5.719925145564913e-05,
"loss": 0.005,
"step": 857
},
{
"epoch": 3.26,
"grad_norm": 0.10712806624372671,
"learning_rate": 5.697669192247215e-05,
"loss": 0.0032,
"step": 858
},
{
"epoch": 3.27,
"grad_norm": 0.2867594004405392,
"learning_rate": 5.675439359517962e-05,
"loss": 0.007,
"step": 859
},
{
"epoch": 3.27,
"grad_norm": 0.15537957888603157,
"learning_rate": 5.6532357823403517e-05,
"loss": 0.0059,
"step": 860
},
{
"epoch": 3.27,
"grad_norm": 0.13908151142988748,
"learning_rate": 5.63105859551817e-05,
"loss": 0.0055,
"step": 861
},
{
"epoch": 3.28,
"grad_norm": 0.1670760871245682,
"learning_rate": 5.608907933694994e-05,
"loss": 0.007,
"step": 862
},
{
"epoch": 3.28,
"grad_norm": 0.15128774900728617,
"learning_rate": 5.586783931353338e-05,
"loss": 0.0071,
"step": 863
},
{
"epoch": 3.29,
"grad_norm": 0.118351563993401,
"learning_rate": 5.56468672281388e-05,
"loss": 0.0052,
"step": 864
},
{
"epoch": 3.29,
"grad_norm": 0.26632088069900783,
"learning_rate": 5.542616442234618e-05,
"loss": 0.0096,
"step": 865
},
{
"epoch": 3.29,
"grad_norm": 0.15185019444874828,
"learning_rate": 5.5205732236100635e-05,
"loss": 0.0065,
"step": 866
},
{
"epoch": 3.3,
"grad_norm": 0.1448611480489451,
"learning_rate": 5.498557200770429e-05,
"loss": 0.0065,
"step": 867
},
{
"epoch": 3.3,
"grad_norm": 0.16431150027248104,
"learning_rate": 5.476568507380815e-05,
"loss": 0.0066,
"step": 868
},
{
"epoch": 3.3,
"grad_norm": 0.11908895770673851,
"learning_rate": 5.454607276940389e-05,
"loss": 0.0032,
"step": 869
},
{
"epoch": 3.31,
"grad_norm": 0.1443718220854692,
"learning_rate": 5.4326736427815946e-05,
"loss": 0.0053,
"step": 870
},
{
"epoch": 3.31,
"grad_norm": 0.14871442319667327,
"learning_rate": 5.410767738069328e-05,
"loss": 0.0059,
"step": 871
},
{
"epoch": 3.32,
"grad_norm": 0.18082767972242653,
"learning_rate": 5.388889695800129e-05,
"loss": 0.0068,
"step": 872
},
{
"epoch": 3.32,
"grad_norm": 0.1648249037129662,
"learning_rate": 5.3670396488013854e-05,
"loss": 0.0067,
"step": 873
},
{
"epoch": 3.32,
"grad_norm": 0.12517423729992644,
"learning_rate": 5.345217729730501e-05,
"loss": 0.0048,
"step": 874
},
{
"epoch": 3.33,
"grad_norm": 0.1464921521793365,
"learning_rate": 5.3234240710741337e-05,
"loss": 0.0068,
"step": 875
},
{
"epoch": 3.33,
"grad_norm": 0.17291303731043475,
"learning_rate": 5.301658805147338e-05,
"loss": 0.0096,
"step": 876
},
{
"epoch": 3.33,
"grad_norm": 0.1280298983910598,
"learning_rate": 5.279922064092808e-05,
"loss": 0.0034,
"step": 877
},
{
"epoch": 3.34,
"grad_norm": 0.12917298155423965,
"learning_rate": 5.25821397988005e-05,
"loss": 0.0043,
"step": 878
},
{
"epoch": 3.34,
"grad_norm": 0.16991548955604624,
"learning_rate": 5.236534684304575e-05,
"loss": 0.0036,
"step": 879
},
{
"epoch": 3.35,
"grad_norm": 0.1341181151945374,
"learning_rate": 5.214884308987136e-05,
"loss": 0.0046,
"step": 880
},
{
"epoch": 3.35,
"grad_norm": 0.07618404508173873,
"learning_rate": 5.193262985372879e-05,
"loss": 0.0027,
"step": 881
},
{
"epoch": 3.35,
"grad_norm": 0.08880977813478996,
"learning_rate": 5.171670844730581e-05,
"loss": 0.0036,
"step": 882
},
{
"epoch": 3.36,
"grad_norm": 0.13216208460528805,
"learning_rate": 5.150108018151845e-05,
"loss": 0.0042,
"step": 883
},
{
"epoch": 3.36,
"grad_norm": 0.11790812970039133,
"learning_rate": 5.128574636550283e-05,
"loss": 0.0035,
"step": 884
},
{
"epoch": 3.37,
"grad_norm": 0.11136507705857467,
"learning_rate": 5.107070830660765e-05,
"loss": 0.0032,
"step": 885
},
{
"epoch": 3.37,
"grad_norm": 0.1625804231101919,
"learning_rate": 5.0855967310385776e-05,
"loss": 0.0035,
"step": 886
},
{
"epoch": 3.37,
"grad_norm": 0.16854543758219576,
"learning_rate": 5.064152468058661e-05,
"loss": 0.013,
"step": 887
},
{
"epoch": 3.38,
"grad_norm": 0.1904583816627798,
"learning_rate": 5.0427381719148115e-05,
"loss": 0.0058,
"step": 888
},
{
"epoch": 3.38,
"grad_norm": 0.1407250892066329,
"learning_rate": 5.021353972618877e-05,
"loss": 0.0048,
"step": 889
},
{
"epoch": 3.38,
"grad_norm": 0.09332762885645429,
"learning_rate": 5.000000000000002e-05,
"loss": 0.0033,
"step": 890
},
{
"epoch": 3.39,
"grad_norm": 0.1282598573789377,
"learning_rate": 4.978676383703792e-05,
"loss": 0.0051,
"step": 891
},
{
"epoch": 3.39,
"grad_norm": 0.12527309494209085,
"learning_rate": 4.957383253191567e-05,
"loss": 0.0039,
"step": 892
},
{
"epoch": 3.4,
"grad_norm": 0.1474717370503847,
"learning_rate": 4.9361207377395526e-05,
"loss": 0.0056,
"step": 893
},
{
"epoch": 3.4,
"grad_norm": 0.17216191154710148,
"learning_rate": 4.914888966438107e-05,
"loss": 0.0061,
"step": 894
},
{
"epoch": 3.4,
"grad_norm": 0.11969436074946377,
"learning_rate": 4.893688068190932e-05,
"loss": 0.0045,
"step": 895
},
{
"epoch": 3.41,
"grad_norm": 0.16780413116289464,
"learning_rate": 4.872518171714285e-05,
"loss": 0.0067,
"step": 896
},
{
"epoch": 3.41,
"grad_norm": 0.09403083909006464,
"learning_rate": 4.8513794055362094e-05,
"loss": 0.0033,
"step": 897
},
{
"epoch": 3.41,
"grad_norm": 0.09334786241295161,
"learning_rate": 4.8302718979957465e-05,
"loss": 0.0025,
"step": 898
},
{
"epoch": 3.42,
"grad_norm": 0.12070714502867205,
"learning_rate": 4.809195777242157e-05,
"loss": 0.0044,
"step": 899
},
{
"epoch": 3.42,
"grad_norm": 0.0724744178923367,
"learning_rate": 4.7881511712341484e-05,
"loss": 0.0029,
"step": 900
},
{
"epoch": 3.42,
"eval_blimp_filtered_avg": 0.7164179104477612,
"eval_blimp_filtered_std": 0.0050468818783488715,
"step": 900
},
{
"epoch": 3.42,
"eval_blimp_supplement_avg": 0.8318965517241379,
"eval_blimp_supplement_std": 0.016663496994065188,
"step": 900
},
{
"epoch": 3.42,
"eval_vqa_filtered_avg": 0.48,
"eval_vqa_filtered_std": 0.05021167315686779,
"step": 900
},
{
"epoch": 3.42,
"eval_winoground_filtered_avg": 0.68,
"eval_winoground_filtered_std": 0.046882617226215034,
"step": 900
},
{
"epoch": 3.43,
"grad_norm": 0.12264611037048552,
"learning_rate": 4.7671382077390923e-05,
"loss": 0.0049,
"step": 901
},
{
"epoch": 3.43,
"grad_norm": 0.1447159721309767,
"learning_rate": 4.746157014332242e-05,
"loss": 0.0044,
"step": 902
},
{
"epoch": 3.43,
"grad_norm": 0.16375949938502024,
"learning_rate": 4.7252077183959766e-05,
"loss": 0.0065,
"step": 903
},
{
"epoch": 3.44,
"grad_norm": 0.10516324597971437,
"learning_rate": 4.704290447119013e-05,
"loss": 0.0043,
"step": 904
},
{
"epoch": 3.44,
"grad_norm": 0.08499567671503878,
"learning_rate": 4.683405327495638e-05,
"loss": 0.0033,
"step": 905
},
{
"epoch": 3.44,
"grad_norm": 0.24460270399320144,
"learning_rate": 4.6625524863249435e-05,
"loss": 0.0073,
"step": 906
},
{
"epoch": 3.45,
"grad_norm": 0.12925387617205644,
"learning_rate": 4.6417320502100316e-05,
"loss": 0.0044,
"step": 907
},
{
"epoch": 3.45,
"grad_norm": 0.09078563973748639,
"learning_rate": 4.6209441455572934e-05,
"loss": 0.0024,
"step": 908
},
{
"epoch": 3.46,
"grad_norm": 0.15653536427369144,
"learning_rate": 4.600188898575585e-05,
"loss": 0.0069,
"step": 909
},
{
"epoch": 3.46,
"grad_norm": 0.2617481572374967,
"learning_rate": 4.5794664352755055e-05,
"loss": 0.0078,
"step": 910
},
{
"epoch": 3.46,
"grad_norm": 0.16686255253211194,
"learning_rate": 4.558776881468616e-05,
"loss": 0.0063,
"step": 911
},
{
"epoch": 3.47,
"grad_norm": 0.14235218284923637,
"learning_rate": 4.538120362766659e-05,
"loss": 0.0043,
"step": 912
},
{
"epoch": 3.47,
"grad_norm": 0.2061352593588118,
"learning_rate": 4.5174970045808373e-05,
"loss": 0.0083,
"step": 913
},
{
"epoch": 3.48,
"grad_norm": 0.09141171412747422,
"learning_rate": 4.496906932121006e-05,
"loss": 0.0038,
"step": 914
},
{
"epoch": 3.48,
"grad_norm": 0.14581164354276802,
"learning_rate": 4.476350270394942e-05,
"loss": 0.0054,
"step": 915
},
{
"epoch": 3.48,
"grad_norm": 0.1909792639108279,
"learning_rate": 4.4558271442075817e-05,
"loss": 0.0058,
"step": 916
},
{
"epoch": 3.49,
"grad_norm": 0.16782053974713387,
"learning_rate": 4.435337678160244e-05,
"loss": 0.0053,
"step": 917
},
{
"epoch": 3.49,
"grad_norm": 0.1679302657777338,
"learning_rate": 4.414881996649909e-05,
"loss": 0.006,
"step": 918
},
{
"epoch": 3.49,
"grad_norm": 0.11555008314937329,
"learning_rate": 4.394460223868422e-05,
"loss": 0.0038,
"step": 919
},
{
"epoch": 3.5,
"grad_norm": 0.13987569584452733,
"learning_rate": 4.374072483801769e-05,
"loss": 0.0057,
"step": 920
},
{
"epoch": 3.5,
"grad_norm": 0.15472739669003643,
"learning_rate": 4.353718900229315e-05,
"loss": 0.0047,
"step": 921
},
{
"epoch": 3.51,
"grad_norm": 0.11860213300146173,
"learning_rate": 4.333399596723054e-05,
"loss": 0.0041,
"step": 922
},
{
"epoch": 3.51,
"grad_norm": 0.12771536456303176,
"learning_rate": 4.313114696646844e-05,
"loss": 0.0051,
"step": 923
},
{
"epoch": 3.51,
"grad_norm": 0.1786915439582824,
"learning_rate": 4.2928643231556844e-05,
"loss": 0.0059,
"step": 924
},
{
"epoch": 3.52,
"grad_norm": 0.24120215565132727,
"learning_rate": 4.272648599194948e-05,
"loss": 0.0046,
"step": 925
},
{
"epoch": 3.52,
"grad_norm": 0.14109245430082046,
"learning_rate": 4.2524676474996436e-05,
"loss": 0.0045,
"step": 926
},
{
"epoch": 3.52,
"grad_norm": 0.08773485394717533,
"learning_rate": 4.232321590593672e-05,
"loss": 0.0026,
"step": 927
},
{
"epoch": 3.53,
"grad_norm": 0.11777262879064121,
"learning_rate": 4.212210550789066e-05,
"loss": 0.0039,
"step": 928
},
{
"epoch": 3.53,
"grad_norm": 0.15303150859624717,
"learning_rate": 4.192134650185271e-05,
"loss": 0.0049,
"step": 929
},
{
"epoch": 3.54,
"grad_norm": 0.17847021642871766,
"learning_rate": 4.172094010668391e-05,
"loss": 0.0122,
"step": 930
},
{
"epoch": 3.54,
"grad_norm": 0.12872303754581352,
"learning_rate": 4.1520887539104516e-05,
"loss": 0.0034,
"step": 931
},
{
"epoch": 3.54,
"grad_norm": 0.09422413881251451,
"learning_rate": 4.132119001368658e-05,
"loss": 0.0032,
"step": 932
},
{
"epoch": 3.55,
"grad_norm": 0.11765015102315801,
"learning_rate": 4.112184874284655e-05,
"loss": 0.004,
"step": 933
},
{
"epoch": 3.55,
"grad_norm": 0.1816646133957582,
"learning_rate": 4.092286493683812e-05,
"loss": 0.0054,
"step": 934
},
{
"epoch": 3.56,
"grad_norm": 0.08821639897774167,
"learning_rate": 4.072423980374452e-05,
"loss": 0.0027,
"step": 935
},
{
"epoch": 3.56,
"grad_norm": 0.109204348627212,
"learning_rate": 4.052597454947151e-05,
"loss": 0.0037,
"step": 936
},
{
"epoch": 3.56,
"grad_norm": 0.07762991091541192,
"learning_rate": 4.0328070377739936e-05,
"loss": 0.0048,
"step": 937
},
{
"epoch": 3.57,
"grad_norm": 0.12303959337300706,
"learning_rate": 4.0130528490078255e-05,
"loss": 0.0054,
"step": 938
},
{
"epoch": 3.57,
"grad_norm": 0.14938977746099588,
"learning_rate": 3.993335008581569e-05,
"loss": 0.0054,
"step": 939
},
{
"epoch": 3.57,
"grad_norm": 0.10793183299648912,
"learning_rate": 3.973653636207437e-05,
"loss": 0.0051,
"step": 940
},
{
"epoch": 3.58,
"grad_norm": 0.07679917764318481,
"learning_rate": 3.954008851376252e-05,
"loss": 0.0021,
"step": 941
},
{
"epoch": 3.58,
"grad_norm": 0.20216658828207906,
"learning_rate": 3.934400773356702e-05,
"loss": 0.0082,
"step": 942
},
{
"epoch": 3.59,
"grad_norm": 0.17081936190481015,
"learning_rate": 3.914829521194606e-05,
"loss": 0.0047,
"step": 943
},
{
"epoch": 3.59,
"grad_norm": 0.1681844318722247,
"learning_rate": 3.895295213712227e-05,
"loss": 0.0053,
"step": 944
},
{
"epoch": 3.59,
"grad_norm": 0.11660462512191513,
"learning_rate": 3.875797969507502e-05,
"loss": 0.0068,
"step": 945
},
{
"epoch": 3.6,
"grad_norm": 0.10899837547854943,
"learning_rate": 3.8563379069533626e-05,
"loss": 0.004,
"step": 946
},
{
"epoch": 3.6,
"grad_norm": 0.11737468807394794,
"learning_rate": 3.836915144196995e-05,
"loss": 0.0043,
"step": 947
},
{
"epoch": 3.6,
"grad_norm": 0.17750707113239161,
"learning_rate": 3.8175297991591316e-05,
"loss": 0.0059,
"step": 948
},
{
"epoch": 3.61,
"grad_norm": 0.08434750541376605,
"learning_rate": 3.7981819895333336e-05,
"loss": 0.0028,
"step": 949
},
{
"epoch": 3.61,
"grad_norm": 0.08977864719318272,
"learning_rate": 3.778871832785262e-05,
"loss": 0.0025,
"step": 950
},
{
"epoch": 3.62,
"grad_norm": 0.26117422627907305,
"learning_rate": 3.759599446151994e-05,
"loss": 0.0058,
"step": 951
},
{
"epoch": 3.62,
"grad_norm": 0.1423125205304331,
"learning_rate": 3.740364946641284e-05,
"loss": 0.005,
"step": 952
},
{
"epoch": 3.62,
"grad_norm": 0.048844504093019145,
"learning_rate": 3.721168451030868e-05,
"loss": 0.0013,
"step": 953
},
{
"epoch": 3.63,
"grad_norm": 0.12750445172306626,
"learning_rate": 3.702010075867748e-05,
"loss": 0.0039,
"step": 954
},
{
"epoch": 3.63,
"grad_norm": 0.1363488868484316,
"learning_rate": 3.682889937467493e-05,
"loss": 0.0054,
"step": 955
},
{
"epoch": 3.63,
"grad_norm": 0.10861963297037971,
"learning_rate": 3.6638081519135115e-05,
"loss": 0.0025,
"step": 956
},
{
"epoch": 3.64,
"grad_norm": 0.07423146974751119,
"learning_rate": 3.6447648350563767e-05,
"loss": 0.002,
"step": 957
},
{
"epoch": 3.64,
"grad_norm": 0.1473106789075466,
"learning_rate": 3.6257601025131026e-05,
"loss": 0.0048,
"step": 958
},
{
"epoch": 3.65,
"grad_norm": 0.20756086874083257,
"learning_rate": 3.6067940696664484e-05,
"loss": 0.0044,
"step": 959
},
{
"epoch": 3.65,
"grad_norm": 0.12432385236166127,
"learning_rate": 3.587866851664219e-05,
"loss": 0.0042,
"step": 960
},
{
"epoch": 3.65,
"grad_norm": 0.1555267425126639,
"learning_rate": 3.568978563418551e-05,
"loss": 0.004,
"step": 961
},
{
"epoch": 3.66,
"grad_norm": 0.10864709636610591,
"learning_rate": 3.5501293196052544e-05,
"loss": 0.003,
"step": 962
},
{
"epoch": 3.66,
"grad_norm": 0.10844528424446415,
"learning_rate": 3.531319234663063e-05,
"loss": 0.003,
"step": 963
},
{
"epoch": 3.67,
"grad_norm": 0.18756992146351528,
"learning_rate": 3.512548422792983e-05,
"loss": 0.0039,
"step": 964
},
{
"epoch": 3.67,
"grad_norm": 0.16307530028534215,
"learning_rate": 3.493816997957582e-05,
"loss": 0.0036,
"step": 965
},
{
"epoch": 3.67,
"grad_norm": 0.21835831614587714,
"learning_rate": 3.4751250738802835e-05,
"loss": 0.0057,
"step": 966
},
{
"epoch": 3.68,
"grad_norm": 0.12509619886030035,
"learning_rate": 3.456472764044718e-05,
"loss": 0.0037,
"step": 967
},
{
"epoch": 3.68,
"grad_norm": 0.1878620997227208,
"learning_rate": 3.4378601816939824e-05,
"loss": 0.0044,
"step": 968
},
{
"epoch": 3.68,
"grad_norm": 0.20067996408678637,
"learning_rate": 3.4192874398299915e-05,
"loss": 0.0074,
"step": 969
},
{
"epoch": 3.69,
"grad_norm": 0.09941229298151419,
"learning_rate": 3.400754651212776e-05,
"loss": 0.0038,
"step": 970
},
{
"epoch": 3.69,
"grad_norm": 0.13212089570245356,
"learning_rate": 3.382261928359791e-05,
"loss": 0.0047,
"step": 971
},
{
"epoch": 3.7,
"grad_norm": 0.08933394980267556,
"learning_rate": 3.36380938354526e-05,
"loss": 0.0039,
"step": 972
},
{
"epoch": 3.7,
"grad_norm": 0.16729919593124423,
"learning_rate": 3.3453971287994545e-05,
"loss": 0.0079,
"step": 973
},
{
"epoch": 3.7,
"grad_norm": 0.15697707244755352,
"learning_rate": 3.3270252759080476e-05,
"loss": 0.0052,
"step": 974
},
{
"epoch": 3.71,
"grad_norm": 0.09194443755715154,
"learning_rate": 3.308693936411421e-05,
"loss": 0.0024,
"step": 975
},
{
"epoch": 3.71,
"grad_norm": 0.04543214336381121,
"learning_rate": 3.290403221603976e-05,
"loss": 0.0014,
"step": 976
},
{
"epoch": 3.71,
"grad_norm": 0.13598936920247842,
"learning_rate": 3.2721532425334934e-05,
"loss": 0.003,
"step": 977
},
{
"epoch": 3.72,
"grad_norm": 0.12519394602390674,
"learning_rate": 3.253944110000415e-05,
"loss": 0.0044,
"step": 978
},
{
"epoch": 3.72,
"grad_norm": 0.10159326068686843,
"learning_rate": 3.235775934557204e-05,
"loss": 0.0037,
"step": 979
},
{
"epoch": 3.73,
"grad_norm": 0.09206667100999626,
"learning_rate": 3.2176488265076596e-05,
"loss": 0.0023,
"step": 980
},
{
"epoch": 3.73,
"grad_norm": 0.13180534984976824,
"learning_rate": 3.199562895906252e-05,
"loss": 0.0059,
"step": 981
},
{
"epoch": 3.73,
"grad_norm": 0.1164365775293987,
"learning_rate": 3.1815182525574495e-05,
"loss": 0.0039,
"step": 982
},
{
"epoch": 3.74,
"grad_norm": 0.09037223834103687,
"learning_rate": 3.163515006015052e-05,
"loss": 0.0031,
"step": 983
},
{
"epoch": 3.74,
"grad_norm": 0.1243136017465312,
"learning_rate": 3.1455532655815346e-05,
"loss": 0.0023,
"step": 984
},
{
"epoch": 3.75,
"grad_norm": 0.06736081989652704,
"learning_rate": 3.1276331403073735e-05,
"loss": 0.0018,
"step": 985
},
{
"epoch": 3.75,
"grad_norm": 0.17307715618033684,
"learning_rate": 3.10975473899039e-05,
"loss": 0.005,
"step": 986
},
{
"epoch": 3.75,
"grad_norm": 0.06048816505854376,
"learning_rate": 3.09191817017509e-05,
"loss": 0.0021,
"step": 987
},
{
"epoch": 3.76,
"grad_norm": 0.12504484352987397,
"learning_rate": 3.074123542152001e-05,
"loss": 0.0032,
"step": 988
},
{
"epoch": 3.76,
"grad_norm": 0.06833118623733686,
"learning_rate": 3.056370962957014e-05,
"loss": 0.0021,
"step": 989
},
{
"epoch": 3.76,
"grad_norm": 0.1108735830960286,
"learning_rate": 3.0386605403707346e-05,
"loss": 0.0034,
"step": 990
},
{
"epoch": 3.77,
"grad_norm": 0.09939702792131645,
"learning_rate": 3.020992381917823e-05,
"loss": 0.0029,
"step": 991
},
{
"epoch": 3.77,
"grad_norm": 0.0991719206432376,
"learning_rate": 3.0033665948663448e-05,
"loss": 0.0033,
"step": 992
},
{
"epoch": 3.78,
"grad_norm": 0.12662565643384302,
"learning_rate": 2.9857832862271183e-05,
"loss": 0.004,
"step": 993
},
{
"epoch": 3.78,
"grad_norm": 0.20781158845550377,
"learning_rate": 2.968242562753051e-05,
"loss": 0.0058,
"step": 994
},
{
"epoch": 3.78,
"grad_norm": 0.14694222050004627,
"learning_rate": 2.9507445309385294e-05,
"loss": 0.0115,
"step": 995
},
{
"epoch": 3.79,
"grad_norm": 0.10808293020495671,
"learning_rate": 2.9332892970187255e-05,
"loss": 0.0043,
"step": 996
},
{
"epoch": 3.79,
"grad_norm": 0.12048492965543693,
"learning_rate": 2.915876966968978e-05,
"loss": 0.0029,
"step": 997
},
{
"epoch": 3.79,
"grad_norm": 0.09225586697033628,
"learning_rate": 2.8985076465041582e-05,
"loss": 0.0023,
"step": 998
},
{
"epoch": 3.8,
"grad_norm": 0.16415643694268406,
"learning_rate": 2.8811814410779957e-05,
"loss": 0.0055,
"step": 999
},
{
"epoch": 3.8,
"grad_norm": 0.11278583943799678,
"learning_rate": 2.8638984558824777e-05,
"loss": 0.0039,
"step": 1000
},
{
"epoch": 3.8,
"eval_blimp_filtered_avg": 0.716865671641791,
"eval_blimp_filtered_std": 0.00502778551816782,
"step": 1000
},
{
"epoch": 3.8,
"eval_blimp_supplement_avg": 0.8362068965517241,
"eval_blimp_supplement_std": 0.016430816592740906,
"step": 1000
},
{
"epoch": 3.8,
"eval_vqa_filtered_avg": 0.48,
"eval_vqa_filtered_std": 0.05021167315686779,
"step": 1000
},
{
"epoch": 3.8,
"eval_winoground_filtered_avg": 0.7,
"eval_winoground_filtered_std": 0.046056618647183814,
"step": 1000
},
{
"epoch": 3.81,
"grad_norm": 0.19790160401886048,
"learning_rate": 2.8466587958471713e-05,
"loss": 0.0079,
"step": 1001
},
{
"epoch": 3.81,
"grad_norm": 0.15241865463221207,
"learning_rate": 2.8294625656386153e-05,
"loss": 0.0033,
"step": 1002
},
{
"epoch": 3.81,
"grad_norm": 0.15486582212720096,
"learning_rate": 2.812309869659675e-05,
"loss": 0.0049,
"step": 1003
},
{
"epoch": 3.82,
"grad_norm": 0.1403453911129104,
"learning_rate": 2.7952008120489005e-05,
"loss": 0.0035,
"step": 1004
},
{
"epoch": 3.82,
"grad_norm": 0.2066076788884496,
"learning_rate": 2.7781354966799078e-05,
"loss": 0.0069,
"step": 1005
},
{
"epoch": 3.83,
"grad_norm": 0.14517188108322096,
"learning_rate": 2.7611140271607417e-05,
"loss": 0.0038,
"step": 1006
},
{
"epoch": 3.83,
"grad_norm": 0.19311241295438267,
"learning_rate": 2.744136506833247e-05,
"loss": 0.0046,
"step": 1007
},
{
"epoch": 3.83,
"grad_norm": 0.15261908108180047,
"learning_rate": 2.7272030387724423e-05,
"loss": 0.0051,
"step": 1008
},
{
"epoch": 3.84,
"grad_norm": 0.148487491757181,
"learning_rate": 2.7103137257858868e-05,
"loss": 0.0047,
"step": 1009
},
{
"epoch": 3.84,
"grad_norm": 0.2545086755837555,
"learning_rate": 2.6934686704130696e-05,
"loss": 0.0078,
"step": 1010
},
{
"epoch": 3.84,
"grad_norm": 0.2155734204394312,
"learning_rate": 2.6766679749247793e-05,
"loss": 0.0053,
"step": 1011
},
{
"epoch": 3.85,
"grad_norm": 0.11633424479852102,
"learning_rate": 2.6599117413224817e-05,
"loss": 0.0026,
"step": 1012
},
{
"epoch": 3.85,
"grad_norm": 0.16185230460669028,
"learning_rate": 2.6432000713377027e-05,
"loss": 0.006,
"step": 1013
},
{
"epoch": 3.86,
"grad_norm": 0.06116976817258535,
"learning_rate": 2.6265330664314157e-05,
"loss": 0.002,
"step": 1014
},
{
"epoch": 3.86,
"grad_norm": 0.1467173492208502,
"learning_rate": 2.6099108277934103e-05,
"loss": 0.0045,
"step": 1015
},
{
"epoch": 3.86,
"grad_norm": 0.06626573294126052,
"learning_rate": 2.5933334563416976e-05,
"loss": 0.002,
"step": 1016
},
{
"epoch": 3.87,
"grad_norm": 0.12063079499322236,
"learning_rate": 2.5768010527218845e-05,
"loss": 0.0027,
"step": 1017
},
{
"epoch": 3.87,
"grad_norm": 0.10686596344627741,
"learning_rate": 2.5603137173065674e-05,
"loss": 0.0043,
"step": 1018
},
{
"epoch": 3.87,
"grad_norm": 0.11272599358320497,
"learning_rate": 2.543871550194723e-05,
"loss": 0.0027,
"step": 1019
},
{
"epoch": 3.88,
"grad_norm": 0.1718157414634651,
"learning_rate": 2.527474651211089e-05,
"loss": 0.0041,
"step": 1020
},
{
"epoch": 3.88,
"grad_norm": 0.11357917775880114,
"learning_rate": 2.5111231199055896e-05,
"loss": 0.0038,
"step": 1021
},
{
"epoch": 3.89,
"grad_norm": 0.07702764510610083,
"learning_rate": 2.494817055552686e-05,
"loss": 0.0025,
"step": 1022
},
{
"epoch": 3.89,
"grad_norm": 0.1682497107538642,
"learning_rate": 2.4785565571508118e-05,
"loss": 0.007,
"step": 1023
},
{
"epoch": 3.89,
"grad_norm": 0.06959209717037654,
"learning_rate": 2.462341723421758e-05,
"loss": 0.0021,
"step": 1024
},
{
"epoch": 3.9,
"grad_norm": 0.1216172792972272,
"learning_rate": 2.4461726528100615e-05,
"loss": 0.0034,
"step": 1025
},
{
"epoch": 3.9,
"grad_norm": 0.1513509657366934,
"learning_rate": 2.4300494434824373e-05,
"loss": 0.004,
"step": 1026
},
{
"epoch": 3.9,
"grad_norm": 0.0865648646398974,
"learning_rate": 2.4139721933271465e-05,
"loss": 0.0024,
"step": 1027
},
{
"epoch": 3.91,
"grad_norm": 0.08556831061548267,
"learning_rate": 2.3979409999534298e-05,
"loss": 0.0023,
"step": 1028
},
{
"epoch": 3.91,
"grad_norm": 0.1641906844837052,
"learning_rate": 2.381955960690906e-05,
"loss": 0.0075,
"step": 1029
},
{
"epoch": 3.92,
"grad_norm": 0.11390717886890653,
"learning_rate": 2.36601717258897e-05,
"loss": 0.003,
"step": 1030
},
{
"epoch": 3.92,
"grad_norm": 0.11731050753260445,
"learning_rate": 2.35012473241623e-05,
"loss": 0.0028,
"step": 1031
},
{
"epoch": 3.92,
"grad_norm": 0.16970688657951408,
"learning_rate": 2.3342787366598872e-05,
"loss": 0.0043,
"step": 1032
},
{
"epoch": 3.93,
"grad_norm": 0.11089863057017167,
"learning_rate": 2.3184792815251766e-05,
"loss": 0.0032,
"step": 1033
},
{
"epoch": 3.93,
"grad_norm": 0.046462119059264825,
"learning_rate": 2.302726462934769e-05,
"loss": 0.0018,
"step": 1034
},
{
"epoch": 3.94,
"grad_norm": 0.11684974119182631,
"learning_rate": 2.2870203765281926e-05,
"loss": 0.0027,
"step": 1035
},
{
"epoch": 3.94,
"grad_norm": 0.16444221243072213,
"learning_rate": 2.2713611176612582e-05,
"loss": 0.0046,
"step": 1036
},
{
"epoch": 3.94,
"grad_norm": 0.1818375194889315,
"learning_rate": 2.25574878140546e-05,
"loss": 0.0051,
"step": 1037
},
{
"epoch": 3.95,
"grad_norm": 0.11786328596160862,
"learning_rate": 2.240183462547427e-05,
"loss": 0.0024,
"step": 1038
},
{
"epoch": 3.95,
"grad_norm": 0.08316727437427408,
"learning_rate": 2.224665255588325e-05,
"loss": 0.0031,
"step": 1039
},
{
"epoch": 3.95,
"grad_norm": 0.07270479819118501,
"learning_rate": 2.2091942547432955e-05,
"loss": 0.0027,
"step": 1040
},
{
"epoch": 3.96,
"grad_norm": 0.05459676934354277,
"learning_rate": 2.193770553940876e-05,
"loss": 0.0016,
"step": 1041
},
{
"epoch": 3.96,
"grad_norm": 0.11956646260392087,
"learning_rate": 2.1783942468224382e-05,
"loss": 0.0018,
"step": 1042
},
{
"epoch": 3.97,
"grad_norm": 0.08747129822087717,
"learning_rate": 2.163065426741603e-05,
"loss": 0.0025,
"step": 1043
},
{
"epoch": 3.97,
"grad_norm": 0.1777877170154106,
"learning_rate": 2.147784186763696e-05,
"loss": 0.0043,
"step": 1044
},
{
"epoch": 3.97,
"grad_norm": 0.08218483908984636,
"learning_rate": 2.132550619665168e-05,
"loss": 0.0018,
"step": 1045
},
{
"epoch": 3.98,
"grad_norm": 0.13582850245610714,
"learning_rate": 2.117364817933033e-05,
"loss": 0.0047,
"step": 1046
},
{
"epoch": 3.98,
"grad_norm": 0.3137890453832542,
"learning_rate": 2.1022268737643138e-05,
"loss": 0.0065,
"step": 1047
},
{
"epoch": 3.98,
"grad_norm": 0.14756824814987923,
"learning_rate": 2.08713687906547e-05,
"loss": 0.0046,
"step": 1048
},
{
"epoch": 3.99,
"grad_norm": 0.12543766199519707,
"learning_rate": 2.0720949254518517e-05,
"loss": 0.0033,
"step": 1049
},
{
"epoch": 3.99,
"grad_norm": 0.1272943584540691,
"learning_rate": 2.05710110424714e-05,
"loss": 0.0046,
"step": 1050
},
{
"epoch": 4.0,
"grad_norm": 0.18454830615921256,
"learning_rate": 2.0421555064827878e-05,
"loss": 0.0072,
"step": 1051
},
{
"epoch": 4.0,
"grad_norm": 0.1537809820642007,
"learning_rate": 2.0272582228974792e-05,
"loss": 0.0043,
"step": 1052
},
{
"epoch": 4.0,
"grad_norm": 0.05312631536933881,
"learning_rate": 2.012409343936551e-05,
"loss": 0.0012,
"step": 1053
},
{
"epoch": 4.01,
"grad_norm": 0.04447185122963781,
"learning_rate": 1.9976089597514903e-05,
"loss": 0.0012,
"step": 1054
},
{
"epoch": 4.01,
"grad_norm": 0.026337967004294745,
"learning_rate": 1.982857160199334e-05,
"loss": 0.0007,
"step": 1055
},
{
"epoch": 4.02,
"grad_norm": 0.026301866799899334,
"learning_rate": 1.9681540348421623e-05,
"loss": 0.001,
"step": 1056
},
{
"epoch": 4.02,
"grad_norm": 0.05436305337809422,
"learning_rate": 1.9534996729465426e-05,
"loss": 0.0014,
"step": 1057
},
{
"epoch": 4.02,
"grad_norm": 0.05631442649400675,
"learning_rate": 1.938894163482974e-05,
"loss": 0.0015,
"step": 1058
},
{
"epoch": 4.03,
"grad_norm": 0.10283079607735844,
"learning_rate": 1.9243375951253796e-05,
"loss": 0.0023,
"step": 1059
},
{
"epoch": 4.03,
"grad_norm": 0.050174636822106565,
"learning_rate": 1.9098300562505266e-05,
"loss": 0.0011,
"step": 1060
},
{
"epoch": 4.03,
"grad_norm": 0.018160762498011917,
"learning_rate": 1.895371634937525e-05,
"loss": 0.0008,
"step": 1061
},
{
"epoch": 4.04,
"grad_norm": 0.053625104843317006,
"learning_rate": 1.880962418967279e-05,
"loss": 0.0017,
"step": 1062
},
{
"epoch": 4.04,
"grad_norm": 0.0367115996959884,
"learning_rate": 1.8666024958219408e-05,
"loss": 0.0013,
"step": 1063
},
{
"epoch": 4.05,
"grad_norm": 0.01705265689595668,
"learning_rate": 1.852291952684414e-05,
"loss": 0.0006,
"step": 1064
},
{
"epoch": 4.05,
"grad_norm": 0.029645947416295275,
"learning_rate": 1.8380308764377842e-05,
"loss": 0.001,
"step": 1065
},
{
"epoch": 4.05,
"grad_norm": 0.02616977130242847,
"learning_rate": 1.8238193536648195e-05,
"loss": 0.001,
"step": 1066
},
{
"epoch": 4.06,
"grad_norm": 0.029725522847751006,
"learning_rate": 1.8096574706474333e-05,
"loss": 0.0011,
"step": 1067
},
{
"epoch": 4.06,
"grad_norm": 0.046999003971840925,
"learning_rate": 1.795545313366166e-05,
"loss": 0.0016,
"step": 1068
},
{
"epoch": 4.06,
"grad_norm": 0.0639334214169769,
"learning_rate": 1.7814829674996592e-05,
"loss": 0.002,
"step": 1069
},
{
"epoch": 4.07,
"grad_norm": 0.0367804832140112,
"learning_rate": 1.767470518424129e-05,
"loss": 0.0011,
"step": 1070
},
{
"epoch": 4.07,
"grad_norm": 0.029348342947469082,
"learning_rate": 1.7535080512128632e-05,
"loss": 0.0008,
"step": 1071
},
{
"epoch": 4.08,
"grad_norm": 0.0466731334600783,
"learning_rate": 1.7395956506356937e-05,
"loss": 0.0015,
"step": 1072
},
{
"epoch": 4.08,
"grad_norm": 0.027508272565636798,
"learning_rate": 1.7257334011584847e-05,
"loss": 0.0009,
"step": 1073
},
{
"epoch": 4.08,
"grad_norm": 0.035628751171613196,
"learning_rate": 1.7119213869426197e-05,
"loss": 0.001,
"step": 1074
},
{
"epoch": 4.09,
"grad_norm": 0.06971830663806321,
"learning_rate": 1.6981596918444953e-05,
"loss": 0.0012,
"step": 1075
},
{
"epoch": 4.09,
"grad_norm": 0.021694267386634027,
"learning_rate": 1.684448399414994e-05,
"loss": 0.0009,
"step": 1076
},
{
"epoch": 4.1,
"grad_norm": 0.02007528840430175,
"learning_rate": 1.6707875928990058e-05,
"loss": 0.0008,
"step": 1077
},
{
"epoch": 4.1,
"grad_norm": 0.04559222251809475,
"learning_rate": 1.6571773552349e-05,
"loss": 0.0011,
"step": 1078
},
{
"epoch": 4.1,
"grad_norm": 0.03837022867264384,
"learning_rate": 1.6436177690540243e-05,
"loss": 0.0013,
"step": 1079
},
{
"epoch": 4.11,
"grad_norm": 0.0596087088326944,
"learning_rate": 1.630108916680223e-05,
"loss": 0.0017,
"step": 1080
},
{
"epoch": 4.11,
"grad_norm": 0.017751318308507253,
"learning_rate": 1.6166508801293013e-05,
"loss": 0.0006,
"step": 1081
},
{
"epoch": 4.11,
"grad_norm": 0.026619692717408015,
"learning_rate": 1.603243741108571e-05,
"loss": 0.0013,
"step": 1082
},
{
"epoch": 4.12,
"grad_norm": 0.020963517608948525,
"learning_rate": 1.5898875810163137e-05,
"loss": 0.0007,
"step": 1083
},
{
"epoch": 4.12,
"grad_norm": 0.027136972648439773,
"learning_rate": 1.5765824809413056e-05,
"loss": 0.0009,
"step": 1084
},
{
"epoch": 4.13,
"grad_norm": 0.05016619274243327,
"learning_rate": 1.5633285216623385e-05,
"loss": 0.0013,
"step": 1085
},
{
"epoch": 4.13,
"grad_norm": 0.029039341510529837,
"learning_rate": 1.5501257836476978e-05,
"loss": 0.0009,
"step": 1086
},
{
"epoch": 4.13,
"grad_norm": 0.07303975658197855,
"learning_rate": 1.5369743470547027e-05,
"loss": 0.0019,
"step": 1087
},
{
"epoch": 4.14,
"grad_norm": 0.03350400586584435,
"learning_rate": 1.5238742917292015e-05,
"loss": 0.0009,
"step": 1088
},
{
"epoch": 4.14,
"grad_norm": 0.08029935115169112,
"learning_rate": 1.5108256972050972e-05,
"loss": 0.0019,
"step": 1089
},
{
"epoch": 4.14,
"grad_norm": 0.04476861542864655,
"learning_rate": 1.4978286427038601e-05,
"loss": 0.0014,
"step": 1090
},
{
"epoch": 4.15,
"grad_norm": 0.0692481373749192,
"learning_rate": 1.4848832071340423e-05,
"loss": 0.0014,
"step": 1091
},
{
"epoch": 4.15,
"grad_norm": 0.0459173556561311,
"learning_rate": 1.4719894690908098e-05,
"loss": 0.0013,
"step": 1092
},
{
"epoch": 4.16,
"grad_norm": 0.05681694784227542,
"learning_rate": 1.4591475068554572e-05,
"loss": 0.0017,
"step": 1093
},
{
"epoch": 4.16,
"grad_norm": 0.03490009549853983,
"learning_rate": 1.4463573983949341e-05,
"loss": 0.001,
"step": 1094
},
{
"epoch": 4.16,
"grad_norm": 0.08256537486908391,
"learning_rate": 1.4336192213613742e-05,
"loss": 0.0029,
"step": 1095
},
{
"epoch": 4.17,
"grad_norm": 0.03304929283596535,
"learning_rate": 1.4209330530916165e-05,
"loss": 0.0015,
"step": 1096
},
{
"epoch": 4.17,
"grad_norm": 0.019413105962450914,
"learning_rate": 1.4082989706067461e-05,
"loss": 0.0008,
"step": 1097
},
{
"epoch": 4.17,
"grad_norm": 0.06390145660785629,
"learning_rate": 1.3957170506116201e-05,
"loss": 0.0009,
"step": 1098
},
{
"epoch": 4.18,
"grad_norm": 0.017555521429492564,
"learning_rate": 1.3831873694944031e-05,
"loss": 0.0007,
"step": 1099
},
{
"epoch": 4.18,
"grad_norm": 0.04810371136355367,
"learning_rate": 1.3707100033261034e-05,
"loss": 0.0014,
"step": 1100
},
{
"epoch": 4.18,
"eval_blimp_filtered_avg": 0.716865671641791,
"eval_blimp_filtered_std": 0.005034673021350593,
"step": 1100
},
{
"epoch": 4.18,
"eval_blimp_supplement_avg": 0.834051724137931,
"eval_blimp_supplement_std": 0.016524406240927558,
"step": 1100
},
{
"epoch": 4.18,
"eval_vqa_filtered_avg": 0.48,
"eval_vqa_filtered_std": 0.05021167315686779,
"step": 1100
},
{
"epoch": 4.18,
"eval_winoground_filtered_avg": 0.68,
"eval_winoground_filtered_std": 0.046882617226215034,
"step": 1100
},
{
"epoch": 4.19,
"grad_norm": 0.013464323565587763,
"learning_rate": 1.3582850278601134e-05,
"loss": 0.0006,
"step": 1101
},
{
"epoch": 4.19,
"grad_norm": 0.029333816746967,
"learning_rate": 1.3459125185317434e-05,
"loss": 0.0009,
"step": 1102
},
{
"epoch": 4.19,
"grad_norm": 0.019738111761638182,
"learning_rate": 1.3335925504577717e-05,
"loss": 0.0007,
"step": 1103
},
{
"epoch": 4.2,
"grad_norm": 0.038558679072453785,
"learning_rate": 1.3213251984359831e-05,
"loss": 0.0009,
"step": 1104
},
{
"epoch": 4.2,
"grad_norm": 0.10728075730756652,
"learning_rate": 1.3091105369447165e-05,
"loss": 0.0014,
"step": 1105
},
{
"epoch": 4.21,
"grad_norm": 0.03939456309548406,
"learning_rate": 1.2969486401424169e-05,
"loss": 0.0009,
"step": 1106
},
{
"epoch": 4.21,
"grad_norm": 0.02688561690752942,
"learning_rate": 1.2848395818671687e-05,
"loss": 0.0007,
"step": 1107
},
{
"epoch": 4.21,
"grad_norm": 0.10118668815662003,
"learning_rate": 1.2727834356362778e-05,
"loss": 0.0035,
"step": 1108
},
{
"epoch": 4.22,
"grad_norm": 0.029540871735792434,
"learning_rate": 1.2607802746457897e-05,
"loss": 0.0008,
"step": 1109
},
{
"epoch": 4.22,
"grad_norm": 0.019004632066757458,
"learning_rate": 1.2488301717700735e-05,
"loss": 0.0008,
"step": 1110
},
{
"epoch": 4.22,
"grad_norm": 0.050506924718716324,
"learning_rate": 1.2369331995613665e-05,
"loss": 0.0015,
"step": 1111
},
{
"epoch": 4.23,
"grad_norm": 0.02906757664336032,
"learning_rate": 1.2250894302493265e-05,
"loss": 0.0009,
"step": 1112
},
{
"epoch": 4.23,
"grad_norm": 0.04633777117624858,
"learning_rate": 1.21329893574062e-05,
"loss": 0.0013,
"step": 1113
},
{
"epoch": 4.24,
"grad_norm": 0.03520147966889168,
"learning_rate": 1.2015617876184527e-05,
"loss": 0.0007,
"step": 1114
},
{
"epoch": 4.24,
"grad_norm": 0.02696880607305335,
"learning_rate": 1.1898780571421552e-05,
"loss": 0.0009,
"step": 1115
},
{
"epoch": 4.24,
"grad_norm": 0.028899261052257607,
"learning_rate": 1.1782478152467502e-05,
"loss": 0.0009,
"step": 1116
},
{
"epoch": 4.25,
"grad_norm": 0.025787842105068102,
"learning_rate": 1.166671132542505e-05,
"loss": 0.0007,
"step": 1117
},
{
"epoch": 4.25,
"grad_norm": 0.023827728319133067,
"learning_rate": 1.1551480793145331e-05,
"loss": 0.0009,
"step": 1118
},
{
"epoch": 4.25,
"grad_norm": 0.030668462671104377,
"learning_rate": 1.1436787255223302e-05,
"loss": 0.0007,
"step": 1119
},
{
"epoch": 4.26,
"grad_norm": 0.02229439145398785,
"learning_rate": 1.1322631407993811e-05,
"loss": 0.0008,
"step": 1120
},
{
"epoch": 4.26,
"grad_norm": 0.08225114568135586,
"learning_rate": 1.1209013944527203e-05,
"loss": 0.0017,
"step": 1121
},
{
"epoch": 4.27,
"grad_norm": 0.0575147043161383,
"learning_rate": 1.1095935554625148e-05,
"loss": 0.0011,
"step": 1122
},
{
"epoch": 4.27,
"grad_norm": 0.05879183520465905,
"learning_rate": 1.098339692481648e-05,
"loss": 0.0016,
"step": 1123
},
{
"epoch": 4.27,
"grad_norm": 0.11651577381341721,
"learning_rate": 1.0871398738352955e-05,
"loss": 0.0017,
"step": 1124
},
{
"epoch": 4.28,
"grad_norm": 0.04216428635525978,
"learning_rate": 1.0759941675205221e-05,
"loss": 0.0016,
"step": 1125
},
{
"epoch": 4.28,
"grad_norm": 0.03712542340164034,
"learning_rate": 1.0649026412058583e-05,
"loss": 0.0012,
"step": 1126
},
{
"epoch": 4.29,
"grad_norm": 0.04116733983643567,
"learning_rate": 1.0538653622308948e-05,
"loss": 0.0009,
"step": 1127
},
{
"epoch": 4.29,
"grad_norm": 0.05379118870775519,
"learning_rate": 1.042882397605871e-05,
"loss": 0.0014,
"step": 1128
},
{
"epoch": 4.29,
"grad_norm": 0.023844321094186195,
"learning_rate": 1.0319538140112728e-05,
"loss": 0.0008,
"step": 1129
},
{
"epoch": 4.3,
"grad_norm": 0.02336448873502654,
"learning_rate": 1.0210796777974197e-05,
"loss": 0.0007,
"step": 1130
},
{
"epoch": 4.3,
"grad_norm": 0.04194993258789071,
"learning_rate": 1.0102600549840701e-05,
"loss": 0.0013,
"step": 1131
},
{
"epoch": 4.3,
"grad_norm": 0.054557517394749656,
"learning_rate": 9.994950112600154e-06,
"loss": 0.0011,
"step": 1132
},
{
"epoch": 4.31,
"grad_norm": 0.15976956920768468,
"learning_rate": 9.887846119826849e-06,
"loss": 0.0023,
"step": 1133
},
{
"epoch": 4.31,
"grad_norm": 0.05469859681916068,
"learning_rate": 9.781289221777478e-06,
"loss": 0.0011,
"step": 1134
},
{
"epoch": 4.32,
"grad_norm": 0.0330826221094112,
"learning_rate": 9.675280065387116e-06,
"loss": 0.0012,
"step": 1135
},
{
"epoch": 4.32,
"grad_norm": 0.037994630222178825,
"learning_rate": 9.569819294265414e-06,
"loss": 0.0007,
"step": 1136
},
{
"epoch": 4.32,
"grad_norm": 0.05446067811636603,
"learning_rate": 9.464907548692614e-06,
"loss": 0.0012,
"step": 1137
},
{
"epoch": 4.33,
"grad_norm": 0.030162059858075463,
"learning_rate": 9.360545465615667e-06,
"loss": 0.0009,
"step": 1138
},
{
"epoch": 4.33,
"grad_norm": 0.01776810562751649,
"learning_rate": 9.256733678644414e-06,
"loss": 0.0006,
"step": 1139
},
{
"epoch": 4.33,
"grad_norm": 0.07017734423776259,
"learning_rate": 9.153472818047625e-06,
"loss": 0.0012,
"step": 1140
},
{
"epoch": 4.34,
"grad_norm": 0.049265529448664684,
"learning_rate": 9.05076351074936e-06,
"loss": 0.0009,
"step": 1141
},
{
"epoch": 4.34,
"grad_norm": 0.01911138033873858,
"learning_rate": 8.948606380324941e-06,
"loss": 0.0008,
"step": 1142
},
{
"epoch": 4.35,
"grad_norm": 0.03207512532598277,
"learning_rate": 8.847002046997354e-06,
"loss": 0.0008,
"step": 1143
},
{
"epoch": 4.35,
"grad_norm": 0.09498054680592705,
"learning_rate": 8.745951127633411e-06,
"loss": 0.0015,
"step": 1144
},
{
"epoch": 4.35,
"grad_norm": 0.02206935138105853,
"learning_rate": 8.645454235739903e-06,
"loss": 0.0007,
"step": 1145
},
{
"epoch": 4.36,
"grad_norm": 0.03319965177433449,
"learning_rate": 8.54551198146013e-06,
"loss": 0.0007,
"step": 1146
},
{
"epoch": 4.36,
"grad_norm": 0.02840733607459401,
"learning_rate": 8.44612497156989e-06,
"loss": 0.0008,
"step": 1147
},
{
"epoch": 4.37,
"grad_norm": 0.018408351544896853,
"learning_rate": 8.347293809474054e-06,
"loss": 0.0006,
"step": 1148
},
{
"epoch": 4.37,
"grad_norm": 0.08136631297548408,
"learning_rate": 8.249019095202736e-06,
"loss": 0.0025,
"step": 1149
},
{
"epoch": 4.37,
"grad_norm": 0.03770345328525376,
"learning_rate": 8.151301425407699e-06,
"loss": 0.0011,
"step": 1150
},
{
"epoch": 4.38,
"grad_norm": 0.021828484215553397,
"learning_rate": 8.054141393358816e-06,
"loss": 0.0008,
"step": 1151
},
{
"epoch": 4.38,
"grad_norm": 0.06858372388136494,
"learning_rate": 7.957539588940299e-06,
"loss": 0.0013,
"step": 1152
},
{
"epoch": 4.38,
"grad_norm": 0.03540771842662069,
"learning_rate": 7.861496598647278e-06,
"loss": 0.0009,
"step": 1153
},
{
"epoch": 4.39,
"grad_norm": 0.06376796414899714,
"learning_rate": 7.76601300558214e-06,
"loss": 0.0017,
"step": 1154
},
{
"epoch": 4.39,
"grad_norm": 0.08541088279435645,
"learning_rate": 7.671089389451058e-06,
"loss": 0.0012,
"step": 1155
},
{
"epoch": 4.4,
"grad_norm": 0.01913967487748475,
"learning_rate": 7.576726326560424e-06,
"loss": 0.0008,
"step": 1156
},
{
"epoch": 4.4,
"grad_norm": 0.01505834145399495,
"learning_rate": 7.482924389813317e-06,
"loss": 0.0006,
"step": 1157
},
{
"epoch": 4.4,
"grad_norm": 0.029574633808729885,
"learning_rate": 7.389684148706122e-06,
"loss": 0.0008,
"step": 1158
},
{
"epoch": 4.41,
"grad_norm": 0.07775324960665371,
"learning_rate": 7.2970061693250154e-06,
"loss": 0.0011,
"step": 1159
},
{
"epoch": 4.41,
"grad_norm": 0.03207503686714158,
"learning_rate": 7.204891014342552e-06,
"loss": 0.0009,
"step": 1160
},
{
"epoch": 4.41,
"grad_norm": 0.02888329550946116,
"learning_rate": 7.113339243014139e-06,
"loss": 0.0008,
"step": 1161
},
{
"epoch": 4.42,
"grad_norm": 0.0194570206084492,
"learning_rate": 7.022351411174866e-06,
"loss": 0.0008,
"step": 1162
},
{
"epoch": 4.42,
"grad_norm": 0.14398642426432864,
"learning_rate": 6.931928071235894e-06,
"loss": 0.0032,
"step": 1163
},
{
"epoch": 4.43,
"grad_norm": 0.09922204312397419,
"learning_rate": 6.842069772181236e-06,
"loss": 0.0019,
"step": 1164
},
{
"epoch": 4.43,
"grad_norm": 0.04657668585468913,
"learning_rate": 6.75277705956443e-06,
"loss": 0.001,
"step": 1165
},
{
"epoch": 4.43,
"grad_norm": 0.014151142624219092,
"learning_rate": 6.664050475505101e-06,
"loss": 0.0006,
"step": 1166
},
{
"epoch": 4.44,
"grad_norm": 0.041876195216166465,
"learning_rate": 6.575890558685882e-06,
"loss": 0.0013,
"step": 1167
},
{
"epoch": 4.44,
"grad_norm": 0.02777421677729617,
"learning_rate": 6.48829784434889e-06,
"loss": 0.0008,
"step": 1168
},
{
"epoch": 4.44,
"grad_norm": 0.03827771284490116,
"learning_rate": 6.4012728642926845e-06,
"loss": 0.0015,
"step": 1169
},
{
"epoch": 4.45,
"grad_norm": 0.1320945724487641,
"learning_rate": 6.314816146868952e-06,
"loss": 0.0021,
"step": 1170
},
{
"epoch": 4.45,
"grad_norm": 0.07196675390680835,
"learning_rate": 6.228928216979257e-06,
"loss": 0.0027,
"step": 1171
},
{
"epoch": 4.46,
"grad_norm": 0.06301629094647111,
"learning_rate": 6.143609596072008e-06,
"loss": 0.0008,
"step": 1172
},
{
"epoch": 4.46,
"grad_norm": 0.04690844306331864,
"learning_rate": 6.0588608021390655e-06,
"loss": 0.0012,
"step": 1173
},
{
"epoch": 4.46,
"grad_norm": 0.02355863678935003,
"learning_rate": 5.97468234971279e-06,
"loss": 0.0007,
"step": 1174
},
{
"epoch": 4.47,
"grad_norm": 0.012557456867681805,
"learning_rate": 5.891074749862857e-06,
"loss": 0.0005,
"step": 1175
},
{
"epoch": 4.47,
"grad_norm": 0.06758959915320514,
"learning_rate": 5.80803851019307e-06,
"loss": 0.0013,
"step": 1176
},
{
"epoch": 4.48,
"grad_norm": 0.03280577696402171,
"learning_rate": 5.725574134838474e-06,
"loss": 0.0011,
"step": 1177
},
{
"epoch": 4.48,
"grad_norm": 0.01838083263589963,
"learning_rate": 5.643682124462057e-06,
"loss": 0.0007,
"step": 1178
},
{
"epoch": 4.48,
"grad_norm": 0.09736850218115797,
"learning_rate": 5.562362976251901e-06,
"loss": 0.0026,
"step": 1179
},
{
"epoch": 4.49,
"grad_norm": 0.038368083330140405,
"learning_rate": 5.481617183918053e-06,
"loss": 0.0012,
"step": 1180
},
{
"epoch": 4.49,
"grad_norm": 0.03241919329467971,
"learning_rate": 5.401445237689573e-06,
"loss": 0.0007,
"step": 1181
},
{
"epoch": 4.49,
"grad_norm": 0.019548513203401407,
"learning_rate": 5.321847624311593e-06,
"loss": 0.0007,
"step": 1182
},
{
"epoch": 4.5,
"grad_norm": 0.03343420920569663,
"learning_rate": 5.242824827042237e-06,
"loss": 0.0006,
"step": 1183
},
{
"epoch": 4.5,
"grad_norm": 0.013860278165749494,
"learning_rate": 5.1643773256498164e-06,
"loss": 0.0004,
"step": 1184
},
{
"epoch": 4.51,
"grad_norm": 0.03250235438008899,
"learning_rate": 5.086505596409885e-06,
"loss": 0.0008,
"step": 1185
},
{
"epoch": 4.51,
"grad_norm": 0.03812286821993014,
"learning_rate": 5.009210112102292e-06,
"loss": 0.001,
"step": 1186
},
{
"epoch": 4.51,
"grad_norm": 0.04724744059732396,
"learning_rate": 4.932491342008383e-06,
"loss": 0.0008,
"step": 1187
},
{
"epoch": 4.52,
"grad_norm": 0.07149621133781368,
"learning_rate": 4.856349751908107e-06,
"loss": 0.0021,
"step": 1188
},
{
"epoch": 4.52,
"grad_norm": 0.011104684384602798,
"learning_rate": 4.7807858040771924e-06,
"loss": 0.0005,
"step": 1189
},
{
"epoch": 4.52,
"grad_norm": 0.02671249567166847,
"learning_rate": 4.705799957284351e-06,
"loss": 0.001,
"step": 1190
},
{
"epoch": 4.53,
"grad_norm": 0.11429993772721231,
"learning_rate": 4.6313926667885035e-06,
"loss": 0.0019,
"step": 1191
},
{
"epoch": 4.53,
"grad_norm": 0.019599526292067716,
"learning_rate": 4.557564384335977e-06,
"loss": 0.0007,
"step": 1192
},
{
"epoch": 4.54,
"grad_norm": 0.023058983963623273,
"learning_rate": 4.4843155581578474e-06,
"loss": 0.0007,
"step": 1193
},
{
"epoch": 4.54,
"grad_norm": 0.01789699934131272,
"learning_rate": 4.411646632967059e-06,
"loss": 0.0006,
"step": 1194
},
{
"epoch": 4.54,
"grad_norm": 0.03475695043132507,
"learning_rate": 4.339558049955927e-06,
"loss": 0.0009,
"step": 1195
},
{
"epoch": 4.55,
"grad_norm": 0.04626061457798834,
"learning_rate": 4.268050246793276e-06,
"loss": 0.001,
"step": 1196
},
{
"epoch": 4.55,
"grad_norm": 0.03777665253578553,
"learning_rate": 4.197123657621915e-06,
"loss": 0.0009,
"step": 1197
},
{
"epoch": 4.56,
"grad_norm": 0.08401265052642046,
"learning_rate": 4.126778713055923e-06,
"loss": 0.0018,
"step": 1198
},
{
"epoch": 4.56,
"grad_norm": 0.01795488576011003,
"learning_rate": 4.0570158401780465e-06,
"loss": 0.0006,
"step": 1199
},
{
"epoch": 4.56,
"grad_norm": 0.05169945359266721,
"learning_rate": 3.987835462537193e-06,
"loss": 0.0012,
"step": 1200
},
{
"epoch": 4.56,
"eval_blimp_filtered_avg": 0.7150746268656717,
"eval_blimp_filtered_std": 0.0050445406050592155,
"step": 1200
},
{
"epoch": 4.56,
"eval_blimp_supplement_avg": 0.8362068965517241,
"eval_blimp_supplement_std": 0.016487870687225285,
"step": 1200
},
{
"epoch": 4.56,
"eval_vqa_filtered_avg": 0.51,
"eval_vqa_filtered_std": 0.05024183937956912,
"step": 1200
},
{
"epoch": 4.56,
"eval_winoground_filtered_avg": 0.67,
"eval_winoground_filtered_std": 0.04725815626252606,
"step": 1200
},
{
"epoch": 4.57,
"grad_norm": 0.03549591031661078,
"learning_rate": 3.919238000145687e-06,
"loss": 0.0014,
"step": 1201
},
{
"epoch": 4.57,
"grad_norm": 0.12212372267764961,
"learning_rate": 3.8512238694768835e-06,
"loss": 0.0018,
"step": 1202
},
{
"epoch": 4.57,
"grad_norm": 0.0708069932236777,
"learning_rate": 3.783793483462583e-06,
"loss": 0.0016,
"step": 1203
},
{
"epoch": 4.58,
"grad_norm": 0.07637524055458063,
"learning_rate": 3.7169472514904544e-06,
"loss": 0.001,
"step": 1204
},
{
"epoch": 4.58,
"grad_norm": 0.03487034956626954,
"learning_rate": 3.6506855794016913e-06,
"loss": 0.0012,
"step": 1205
},
{
"epoch": 4.59,
"grad_norm": 0.026640228769618846,
"learning_rate": 3.5850088694884156e-06,
"loss": 0.0009,
"step": 1206
},
{
"epoch": 4.59,
"grad_norm": 0.012878283131233357,
"learning_rate": 3.5199175204913117e-06,
"loss": 0.0005,
"step": 1207
},
{
"epoch": 4.59,
"grad_norm": 0.04703267681123701,
"learning_rate": 3.455411927597185e-06,
"loss": 0.0012,
"step": 1208
},
{
"epoch": 4.6,
"grad_norm": 0.0153560104298258,
"learning_rate": 3.3914924824365422e-06,
"loss": 0.0006,
"step": 1209
},
{
"epoch": 4.6,
"grad_norm": 0.05920573168766921,
"learning_rate": 3.3281595730812575e-06,
"loss": 0.001,
"step": 1210
},
{
"epoch": 4.6,
"grad_norm": 0.07240573247327886,
"learning_rate": 3.265413584042165e-06,
"loss": 0.0015,
"step": 1211
},
{
"epoch": 4.61,
"grad_norm": 0.03877978634973686,
"learning_rate": 3.203254896266761e-06,
"loss": 0.0009,
"step": 1212
},
{
"epoch": 4.61,
"grad_norm": 0.0201160866989832,
"learning_rate": 3.1416838871368924e-06,
"loss": 0.0009,
"step": 1213
},
{
"epoch": 4.62,
"grad_norm": 0.048187931567094715,
"learning_rate": 3.080700930466429e-06,
"loss": 0.0012,
"step": 1214
},
{
"epoch": 4.62,
"grad_norm": 0.07174256245695844,
"learning_rate": 3.0203063964990617e-06,
"loss": 0.0018,
"step": 1215
},
{
"epoch": 4.62,
"grad_norm": 0.03188905472254505,
"learning_rate": 2.9605006519059507e-06,
"loss": 0.0009,
"step": 1216
},
{
"epoch": 4.63,
"grad_norm": 0.026587740185543742,
"learning_rate": 2.9012840597836045e-06,
"loss": 0.0007,
"step": 1217
},
{
"epoch": 4.63,
"grad_norm": 0.051838257669286805,
"learning_rate": 2.8426569796516146e-06,
"loss": 0.0011,
"step": 1218
},
{
"epoch": 4.63,
"grad_norm": 0.027884361782616187,
"learning_rate": 2.7846197674504913e-06,
"loss": 0.0007,
"step": 1219
},
{
"epoch": 4.64,
"grad_norm": 0.05934491011655865,
"learning_rate": 2.7271727755395214e-06,
"loss": 0.0014,
"step": 1220
},
{
"epoch": 4.64,
"grad_norm": 0.07722505492466579,
"learning_rate": 2.6703163526945794e-06,
"loss": 0.0014,
"step": 1221
},
{
"epoch": 4.65,
"grad_norm": 0.017911785319840934,
"learning_rate": 2.614050844106042e-06,
"loss": 0.0006,
"step": 1222
},
{
"epoch": 4.65,
"grad_norm": 0.07048675908478848,
"learning_rate": 2.5583765913766987e-06,
"loss": 0.0012,
"step": 1223
},
{
"epoch": 4.65,
"grad_norm": 0.029385343497706086,
"learning_rate": 2.503293932519668e-06,
"loss": 0.0011,
"step": 1224
},
{
"epoch": 4.66,
"grad_norm": 0.026721071049153142,
"learning_rate": 2.4488032019563402e-06,
"loss": 0.0008,
"step": 1225
},
{
"epoch": 4.66,
"grad_norm": 0.05733009421653368,
"learning_rate": 2.394904730514358e-06,
"loss": 0.0008,
"step": 1226
},
{
"epoch": 4.67,
"grad_norm": 0.048459011121664085,
"learning_rate": 2.3415988454255855e-06,
"loss": 0.001,
"step": 1227
},
{
"epoch": 4.67,
"grad_norm": 0.08348067589435784,
"learning_rate": 2.2888858703241866e-06,
"loss": 0.0015,
"step": 1228
},
{
"epoch": 4.67,
"grad_norm": 0.0723135438217715,
"learning_rate": 2.236766125244549e-06,
"loss": 0.0013,
"step": 1229
},
{
"epoch": 4.68,
"grad_norm": 0.043987088946181196,
"learning_rate": 2.1852399266194314e-06,
"loss": 0.0012,
"step": 1230
},
{
"epoch": 4.68,
"grad_norm": 0.02610918331075917,
"learning_rate": 2.1343075872780396e-06,
"loss": 0.0008,
"step": 1231
},
{
"epoch": 4.68,
"grad_norm": 0.02610217806342308,
"learning_rate": 2.0839694164440425e-06,
"loss": 0.0007,
"step": 1232
},
{
"epoch": 4.69,
"grad_norm": 0.016676961400433666,
"learning_rate": 2.03422571973384e-06,
"loss": 0.0007,
"step": 1233
},
{
"epoch": 4.69,
"grad_norm": 0.0712789068176972,
"learning_rate": 1.985076799154528e-06,
"loss": 0.0011,
"step": 1234
},
{
"epoch": 4.7,
"grad_norm": 0.040623193014393696,
"learning_rate": 1.9365229531022264e-06,
"loss": 0.0009,
"step": 1235
},
{
"epoch": 4.7,
"grad_norm": 0.0174120811798264,
"learning_rate": 1.8885644763601774e-06,
"loss": 0.0006,
"step": 1236
},
{
"epoch": 4.7,
"grad_norm": 0.02397234113764592,
"learning_rate": 1.8412016600969695e-06,
"loss": 0.0008,
"step": 1237
},
{
"epoch": 4.71,
"grad_norm": 0.04262503636577534,
"learning_rate": 1.7944347918648185e-06,
"loss": 0.0011,
"step": 1238
},
{
"epoch": 4.71,
"grad_norm": 0.02559933629938596,
"learning_rate": 1.748264155597712e-06,
"loss": 0.0007,
"step": 1239
},
{
"epoch": 4.71,
"grad_norm": 0.024338642632736805,
"learning_rate": 1.7026900316098215e-06,
"loss": 0.0007,
"step": 1240
},
{
"epoch": 4.72,
"grad_norm": 0.04865116104780992,
"learning_rate": 1.657712696593705e-06,
"loss": 0.0014,
"step": 1241
},
{
"epoch": 4.72,
"grad_norm": 0.016653138748607372,
"learning_rate": 1.6133324236186742e-06,
"loss": 0.0006,
"step": 1242
},
{
"epoch": 4.73,
"grad_norm": 0.02913263827559182,
"learning_rate": 1.5695494821290735e-06,
"loss": 0.0007,
"step": 1243
},
{
"epoch": 4.73,
"grad_norm": 0.1053160720503692,
"learning_rate": 1.5263641379427595e-06,
"loss": 0.0022,
"step": 1244
},
{
"epoch": 4.73,
"grad_norm": 0.06917659494864752,
"learning_rate": 1.4837766532493468e-06,
"loss": 0.0014,
"step": 1245
},
{
"epoch": 4.74,
"grad_norm": 0.034919075850131744,
"learning_rate": 1.4417872866087534e-06,
"loss": 0.0006,
"step": 1246
},
{
"epoch": 4.74,
"grad_norm": 0.07398471019830524,
"learning_rate": 1.400396292949513e-06,
"loss": 0.0013,
"step": 1247
},
{
"epoch": 4.75,
"grad_norm": 0.02043767100243107,
"learning_rate": 1.3596039235672874e-06,
"loss": 0.0007,
"step": 1248
},
{
"epoch": 4.75,
"grad_norm": 0.026716481737152874,
"learning_rate": 1.3194104261233798e-06,
"loss": 0.0007,
"step": 1249
},
{
"epoch": 4.75,
"grad_norm": 0.04718046174374857,
"learning_rate": 1.2798160446431006e-06,
"loss": 0.0013,
"step": 1250
},
{
"epoch": 4.76,
"grad_norm": 0.053929792186028926,
"learning_rate": 1.2408210195144376e-06,
"loss": 0.001,
"step": 1251
},
{
"epoch": 4.76,
"grad_norm": 0.016211755152668282,
"learning_rate": 1.2024255874865108e-06,
"loss": 0.0006,
"step": 1252
},
{
"epoch": 4.76,
"grad_norm": 0.02218925262348946,
"learning_rate": 1.1646299816681195e-06,
"loss": 0.0006,
"step": 1253
},
{
"epoch": 4.77,
"grad_norm": 0.061559821718042196,
"learning_rate": 1.1274344315264196e-06,
"loss": 0.0018,
"step": 1254
},
{
"epoch": 4.77,
"grad_norm": 0.011397207501113183,
"learning_rate": 1.0908391628854041e-06,
"loss": 0.0005,
"step": 1255
},
{
"epoch": 4.78,
"grad_norm": 0.022367382815004444,
"learning_rate": 1.0548443979246481e-06,
"loss": 0.0008,
"step": 1256
},
{
"epoch": 4.78,
"grad_norm": 0.11917770316499016,
"learning_rate": 1.0194503551778866e-06,
"loss": 0.0016,
"step": 1257
},
{
"epoch": 4.78,
"grad_norm": 0.023235418948401518,
"learning_rate": 9.846572495316952e-07,
"loss": 0.0007,
"step": 1258
},
{
"epoch": 4.79,
"grad_norm": 0.017722263245702797,
"learning_rate": 9.504652922242562e-07,
"loss": 0.0007,
"step": 1259
},
{
"epoch": 4.79,
"grad_norm": 0.15628375169850256,
"learning_rate": 9.168746908439718e-07,
"loss": 0.0026,
"step": 1260
},
{
"epoch": 4.79,
"grad_norm": 0.019997091181886018,
"learning_rate": 8.838856493282754e-07,
"loss": 0.0007,
"step": 1261
},
{
"epoch": 4.8,
"grad_norm": 0.05125265984626611,
"learning_rate": 8.514983679623556e-07,
"loss": 0.001,
"step": 1262
},
{
"epoch": 4.8,
"grad_norm": 0.014670438570475425,
"learning_rate": 8.197130433779565e-07,
"loss": 0.0005,
"step": 1263
},
{
"epoch": 4.81,
"grad_norm": 0.06613452163795065,
"learning_rate": 7.885298685522235e-07,
"loss": 0.0024,
"step": 1264
},
{
"epoch": 4.81,
"grad_norm": 0.03851957779319382,
"learning_rate": 7.579490328064265e-07,
"loss": 0.0013,
"step": 1265
},
{
"epoch": 4.81,
"grad_norm": 0.01550807081268757,
"learning_rate": 7.27970721804927e-07,
"loss": 0.0006,
"step": 1266
},
{
"epoch": 4.82,
"grad_norm": 0.02645554854912103,
"learning_rate": 6.985951175539796e-07,
"loss": 0.0007,
"step": 1267
},
{
"epoch": 4.82,
"grad_norm": 0.03279121203073833,
"learning_rate": 6.698223984006436e-07,
"loss": 0.0009,
"step": 1268
},
{
"epoch": 4.83,
"grad_norm": 0.0519224271730649,
"learning_rate": 6.416527390317173e-07,
"loss": 0.0009,
"step": 1269
},
{
"epoch": 4.83,
"grad_norm": 0.09986357170391567,
"learning_rate": 6.140863104726391e-07,
"loss": 0.0016,
"step": 1270
},
{
"epoch": 4.83,
"grad_norm": 0.03275074991380398,
"learning_rate": 5.87123280086499e-07,
"loss": 0.0007,
"step": 1271
},
{
"epoch": 4.84,
"grad_norm": 0.04587670912561613,
"learning_rate": 5.607638115729841e-07,
"loss": 0.0015,
"step": 1272
},
{
"epoch": 4.84,
"grad_norm": 0.027686155223001784,
"learning_rate": 5.350080649674127e-07,
"loss": 0.0007,
"step": 1273
},
{
"epoch": 4.84,
"grad_norm": 0.0223211970919243,
"learning_rate": 5.098561966397131e-07,
"loss": 0.0006,
"step": 1274
},
{
"epoch": 4.85,
"grad_norm": 0.017928512348313316,
"learning_rate": 4.85308359293557e-07,
"loss": 0.0005,
"step": 1275
},
{
"epoch": 4.85,
"grad_norm": 0.03475070764217293,
"learning_rate": 4.613647019653389e-07,
"loss": 0.0007,
"step": 1276
},
{
"epoch": 4.86,
"grad_norm": 0.020434320555203736,
"learning_rate": 4.3802537002335386e-07,
"loss": 0.0008,
"step": 1277
},
{
"epoch": 4.86,
"grad_norm": 0.025932942114901558,
"learning_rate": 4.152905051668321e-07,
"loss": 0.0008,
"step": 1278
},
{
"epoch": 4.86,
"grad_norm": 0.0629734875825493,
"learning_rate": 3.931602454251837e-07,
"loss": 0.0014,
"step": 1279
},
{
"epoch": 4.87,
"grad_norm": 0.017477717878726103,
"learning_rate": 3.716347251570551e-07,
"loss": 0.0006,
"step": 1280
},
{
"epoch": 4.87,
"grad_norm": 0.028129693282624404,
"learning_rate": 3.50714075049563e-07,
"loss": 0.0009,
"step": 1281
},
{
"epoch": 4.87,
"grad_norm": 0.3789011247756691,
"learning_rate": 3.3039842211752824e-07,
"loss": 0.0007,
"step": 1282
},
{
"epoch": 4.88,
"grad_norm": 0.02028981654233484,
"learning_rate": 3.106878897026544e-07,
"loss": 0.0007,
"step": 1283
},
{
"epoch": 4.88,
"grad_norm": 0.017043915032762993,
"learning_rate": 2.915825974727726e-07,
"loss": 0.0007,
"step": 1284
},
{
"epoch": 4.89,
"grad_norm": 0.02765997006700044,
"learning_rate": 2.7308266142119785e-07,
"loss": 0.0008,
"step": 1285
},
{
"epoch": 4.89,
"grad_norm": 0.05844451311284074,
"learning_rate": 2.5518819386590734e-07,
"loss": 0.0011,
"step": 1286
},
{
"epoch": 4.89,
"grad_norm": 0.05688879982444352,
"learning_rate": 2.3789930344897404e-07,
"loss": 0.0008,
"step": 1287
},
{
"epoch": 4.9,
"grad_norm": 0.028207898085014158,
"learning_rate": 2.212160951358011e-07,
"loss": 0.001,
"step": 1288
},
{
"epoch": 4.9,
"grad_norm": 0.016086514604394798,
"learning_rate": 2.0513867021457744e-07,
"loss": 0.0007,
"step": 1289
},
{
"epoch": 4.9,
"grad_norm": 0.02971312168354947,
"learning_rate": 1.8966712629558957e-07,
"loss": 0.0009,
"step": 1290
},
{
"epoch": 4.91,
"grad_norm": 0.021944051433958625,
"learning_rate": 1.748015573106887e-07,
"loss": 0.0007,
"step": 1291
},
{
"epoch": 4.91,
"grad_norm": 0.04949653596283253,
"learning_rate": 1.6054205351265784e-07,
"loss": 0.0021,
"step": 1292
},
{
"epoch": 4.92,
"grad_norm": 0.06054720833614147,
"learning_rate": 1.4688870147473443e-07,
"loss": 0.0011,
"step": 1293
},
{
"epoch": 4.92,
"grad_norm": 0.02434476920884362,
"learning_rate": 1.338415840900109e-07,
"loss": 0.0007,
"step": 1294
},
{
"epoch": 4.92,
"grad_norm": 0.04425200814298354,
"learning_rate": 1.2140078057101266e-07,
"loss": 0.001,
"step": 1295
},
{
"epoch": 4.93,
"grad_norm": 0.08400768778349636,
"learning_rate": 1.0956636644912088e-07,
"loss": 0.0015,
"step": 1296
},
{
"epoch": 4.93,
"grad_norm": 0.01681668533811899,
"learning_rate": 9.833841357421714e-08,
"loss": 0.0006,
"step": 1297
},
{
"epoch": 4.94,
"grad_norm": 0.06053068555064212,
"learning_rate": 8.771699011416168e-08,
"loss": 0.0019,
"step": 1298
},
{
"epoch": 4.94,
"grad_norm": 0.04317850317270507,
"learning_rate": 7.770216055443814e-08,
"loss": 0.0011,
"step": 1299
},
{
"epoch": 4.94,
"grad_norm": 0.08586897126139875,
"learning_rate": 6.829398569770939e-08,
"loss": 0.001,
"step": 1300
},
{
"epoch": 4.94,
"eval_blimp_filtered_avg": 0.7153731343283583,
"eval_blimp_filtered_std": 0.005045209651150972,
"step": 1300
},
{
"epoch": 4.94,
"eval_blimp_supplement_avg": 0.834051724137931,
"eval_blimp_supplement_std": 0.016654601081150626,
"step": 1300
},
{
"epoch": 4.94,
"eval_vqa_filtered_avg": 0.5,
"eval_vqa_filtered_std": 0.050251890762960605,
"step": 1300
},
{
"epoch": 4.94,
"eval_winoground_filtered_avg": 0.69,
"eval_winoground_filtered_std": 0.04648231987117316,
"step": 1300
},
{
"epoch": 4.95,
"grad_norm": 0.016208779457975556,
"learning_rate": 5.94925226635068e-08,
"loss": 0.0006,
"step": 1301
},
{
"epoch": 4.95,
"grad_norm": 0.01833907339793384,
"learning_rate": 5.1297824887841516e-08,
"loss": 0.0006,
"step": 1302
},
{
"epoch": 4.95,
"grad_norm": 0.036830940713068985,
"learning_rate": 4.370994212287149e-08,
"loss": 0.001,
"step": 1303
},
{
"epoch": 4.96,
"grad_norm": 0.015645769111424807,
"learning_rate": 3.672892043666831e-08,
"loss": 0.0006,
"step": 1304
},
{
"epoch": 4.96,
"grad_norm": 0.07940082249502824,
"learning_rate": 3.0354802212839705e-08,
"loss": 0.0011,
"step": 1305
},
{
"epoch": 4.97,
"grad_norm": 0.015159814855124942,
"learning_rate": 2.4587626150351926e-08,
"loss": 0.0005,
"step": 1306
},
{
"epoch": 4.97,
"grad_norm": 0.022458919214575546,
"learning_rate": 1.94274272632633e-08,
"loss": 0.001,
"step": 1307
},
{
"epoch": 4.97,
"grad_norm": 0.02980953425255622,
"learning_rate": 1.4874236880491055e-08,
"loss": 0.0008,
"step": 1308
},
{
"epoch": 4.98,
"grad_norm": 0.023911967525172478,
"learning_rate": 1.0928082645667025e-08,
"loss": 0.0007,
"step": 1309
},
{
"epoch": 4.98,
"grad_norm": 0.01333819056514401,
"learning_rate": 7.58898851693779e-09,
"loss": 0.0005,
"step": 1310
},
{
"epoch": 4.98,
"grad_norm": 0.028702718224002614,
"learning_rate": 4.856974766831446e-09,
"loss": 0.0008,
"step": 1311
},
{
"epoch": 4.99,
"grad_norm": 0.022211928652402655,
"learning_rate": 2.732057982124392e-09,
"loss": 0.0006,
"step": 1312
},
{
"epoch": 4.99,
"grad_norm": 0.021399146337567086,
"learning_rate": 1.2142510637414006e-09,
"loss": 0.0008,
"step": 1313
},
{
"epoch": 5.0,
"grad_norm": 0.06401289450470597,
"learning_rate": 3.035632266890076e-10,
"loss": 0.0017,
"step": 1314
},
{
"epoch": 5.0,
"grad_norm": 0.10829033858276516,
"learning_rate": 0.0,
"loss": 0.0017,
"step": 1315
},
{
"epoch": 5.0,
"step": 1315,
"total_flos": 76761234800640.0,
"train_loss": 0.24336853736315273,
"train_runtime": 20724.9349,
"train_samples_per_second": 16.232,
"train_steps_per_second": 0.063
}
],
"logging_steps": 1.0,
"max_steps": 1315,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"total_flos": 76761234800640.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}