carinnew commited on
Commit
79889d8
1 Parent(s): 2453b9e

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_embd": 1280,
14
+ "n_head": 20,
15
+ "n_inner": null,
16
+ "n_layer": 36,
17
+ "n_positions": 1024,
18
+ "reorder_and_upcast_attn": false,
19
+ "resid_pdrop": 0.1,
20
+ "scale_attn_by_inverse_layer_idx": false,
21
+ "scale_attn_weights": true,
22
+ "summary_activation": null,
23
+ "summary_first_dropout": 0.1,
24
+ "summary_proj_to_labels": true,
25
+ "summary_type": "cls_index",
26
+ "summary_use_proj": true,
27
+ "transformers_version": "4.36.0",
28
+ "use_cache": true,
29
+ "vocab_size": 50257
30
+ }
epoch_losses.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train_log": [{"learning_rate": 2.9999999242136255e-05, "loss": 6.8986382484436035, "step": 500}, {"learning_rate": 2.9962950065964833e-05, "loss": 6.098959445953369, "step": 1000}, {"learning_rate": 2.9925902708782814e-05, "loss": 5.757495403289795, "step": 1500}, {"learning_rate": 2.9888853532611392e-05, "loss": 5.713656425476074, "step": 2000}, {"learning_rate": 2.985180435643997e-05, "loss": 5.3112335205078125, "step": 2500}, {"learning_rate": 2.981475518026855e-05, "loss": 5.326865196228027, "step": 3000}, {"learning_rate": 2.9777706004097126e-05, "loss": 5.290707588195801, "step": 3500}, {"learning_rate": 2.9740658646915108e-05, "loss": 5.138245582580566, "step": 4000}, {"learning_rate": 2.9703609470743686e-05, "loss": 5.229057312011719, "step": 4500}, {"learning_rate": 2.9666562113561668e-05, "loss": 4.876111030578613, "step": 5000}, {"learning_rate": 2.9629511118400842e-05, "loss": 5.0490803718566895, "step": 5500}, {"learning_rate": 2.959246194222942e-05, "loss": 4.547306060791016, "step": 6000}, {"learning_rate": 2.95554145850474e-05, "loss": 4.496936321258545, "step": 6500}, {"learning_rate": 2.951836540887598e-05, "loss": 4.5053205490112305, "step": 7000}, {"learning_rate": 2.948131805169396e-05, "loss": 4.488462448120117, "step": 7500}, {"learning_rate": 2.944426887552254e-05, "loss": 4.52130126953125, "step": 8000}, {"learning_rate": 2.9407217880361713e-05, "loss": 4.5772552490234375, "step": 8500}, {"learning_rate": 2.9370170523179695e-05, "loss": 4.569002151489258, "step": 9000}, {"learning_rate": 2.9333121347008273e-05, "loss": 4.470548152923584, "step": 9500}, {"learning_rate": 2.9296073989826255e-05, "loss": 4.075364589691162, "step": 10000}, {"learning_rate": 2.9259024813654833e-05, "loss": 4.13009786605835, "step": 10500}, {"learning_rate": 2.922197563748341e-05, "loss": 4.204780101776123, "step": 11000}, {"learning_rate": 2.918492646131199e-05, "loss": 4.47165584564209, "step": 11500}, {"learning_rate": 2.9147877285140567e-05, "loss": 3.9254674911499023, "step": 12000}, {"learning_rate": 2.9110829927958548e-05, "loss": 4.262510299682617, "step": 12500}, {"learning_rate": 2.9073780751787126e-05, "loss": 3.8191840648651123, "step": 13000}, {"learning_rate": 2.9036731575615704e-05, "loss": 4.214046478271484, "step": 13500}, {"learning_rate": 2.8999684218433686e-05, "loss": 4.321501731872559, "step": 14000}, {"learning_rate": 2.8962635042262264e-05, "loss": 4.193140029907227, "step": 14500}, {"learning_rate": 2.892558586609084e-05, "loss": 4.00380802154541, "step": 15000}, {"learning_rate": 2.888853668991942e-05, "loss": 4.031393051147461, "step": 15500}, {"learning_rate": 2.8851487513747998e-05, "loss": 4.04387092590332, "step": 16000}, {"learning_rate": 2.881444015656598e-05, "loss": 4.095037460327148, "step": 16500}, {"learning_rate": 2.8777390980394557e-05, "loss": 3.8619585037231445, "step": 17000}, {"learning_rate": 2.874034362321254e-05, "loss": 3.920173168182373, "step": 17500}, {"learning_rate": 2.8703292628051713e-05, "loss": 3.949486494064331, "step": 18000}, {"learning_rate": 2.866624345188029e-05, "loss": 4.103151798248291, "step": 18500}, {"learning_rate": 2.8629196094698273e-05, "loss": 4.218245983123779, "step": 19000}, {"learning_rate": 2.859214691852685e-05, "loss": 3.8536133766174316, "step": 19500}, {"learning_rate": 2.855509774235543e-05, "loss": 3.693816900253296, "step": 20000}, {"learning_rate": 2.851805038517341e-05, "loss": 3.9139347076416016, "step": 20500}, {"learning_rate": 2.8480999390012585e-05, "loss": 3.9494645595550537, "step": 21000}, {"learning_rate": 2.8443952032830566e-05, "loss": 3.4718356132507324, "step": 21500}, {"learning_rate": 2.8406902856659144e-05, "loss": 3.6764211654663086, "step": 22000}, {"learning_rate": 2.8369853680487722e-05, "loss": 3.7224197387695312, "step": 22500}, {"learning_rate": 2.8332806323305704e-05, "loss": 3.969146490097046, "step": 23000}, {"learning_rate": 2.8295757147134282e-05, "loss": 3.8573853969573975, "step": 23500}, {"learning_rate": 2.8258709789952263e-05, "loss": 3.787998676300049, "step": 24000}, {"learning_rate": 2.8221658794791438e-05, "loss": 3.437772512435913, "step": 24500}, {"learning_rate": 2.8184609618620016e-05, "loss": 3.9223814010620117, "step": 25000}, {"learning_rate": 2.8147562261437997e-05, "loss": 3.773622989654541, "step": 25500}, {"learning_rate": 2.8110513085266575e-05, "loss": 3.382697582244873, "step": 26000}, {"learning_rate": 2.8073465728084557e-05, "loss": 3.758333683013916, "step": 26500}, {"learning_rate": 2.803641473292373e-05, "loss": 3.4451656341552734, "step": 27000}, {"learning_rate": 2.799936555675231e-05, "loss": 3.478599786758423, "step": 27500}, {"learning_rate": 2.796231819957029e-05, "loss": 3.462559700012207, "step": 28000}, {"learning_rate": 2.792526902339887e-05, "loss": 3.6022117137908936, "step": 28500}, {"learning_rate": 2.788822166621685e-05, "loss": 3.387537717819214, "step": 29000}, {"learning_rate": 2.7851172490045428e-05, "loss": 4.005965709686279, "step": 29500}, {"learning_rate": 2.7814123313874006e-05, "loss": 3.670732021331787, "step": 30000}, {"learning_rate": 2.7777074137702584e-05, "loss": 3.8734958171844482, "step": 30500}, {"learning_rate": 2.7740024961531162e-05, "loss": 3.8407154083251953, "step": 31000}, {"learning_rate": 2.7702977604349144e-05, "loss": 3.2580385208129883, "step": 31500}, {"learning_rate": 2.7665928428177722e-05, "loss": 3.4980108737945557, "step": 32000}, {"learning_rate": 2.76288792520063e-05, "loss": 3.83961820602417, "step": 32500}, {"learning_rate": 2.759183189482428e-05, "loss": 3.6410164833068848, "step": 33000}, {"learning_rate": 2.7554780899663456e-05, "loss": 3.7315235137939453, "step": 33500}, {"learning_rate": 2.7517733542481437e-05, "loss": 3.72507381439209, "step": 34000}, {"learning_rate": 2.7480684366310015e-05, "loss": 3.5465457439422607, "step": 34500}, {"learning_rate": 2.7443635190138593e-05, "loss": 3.6483407020568848, "step": 35000}, {"learning_rate": 2.7406587832956575e-05, "loss": 3.8554999828338623, "step": 35500}, {"learning_rate": 2.7369538656785153e-05, "loss": 3.527710437774658, "step": 36000}, {"learning_rate": 2.733248948061373e-05, "loss": 3.7118029594421387, "step": 36500}, {"learning_rate": 2.729544030444231e-05, "loss": 3.4001858234405518, "step": 37000}, {"learning_rate": 2.7258391128270887e-05, "loss": 3.595517158508301, "step": 37500}, {"learning_rate": 2.722134377108887e-05, "loss": 3.73433780670166, "step": 38000}, {"learning_rate": 2.7184294594917446e-05, "loss": 3.476926326751709, "step": 38500}, {"learning_rate": 2.7147247237735428e-05, "loss": 3.4273738861083984, "step": 39000}, {"learning_rate": 2.7110198061564006e-05, "loss": 3.851701259613037, "step": 39500}, {"learning_rate": 2.707314706640318e-05, "loss": 3.620213508605957, "step": 40000}, {"learning_rate": 2.7036099709221162e-05, "loss": 3.5032501220703125, "step": 40500}, {"learning_rate": 2.699905053304974e-05, "loss": 3.4897947311401367, "step": 41000}, {"learning_rate": 2.696200317586772e-05, "loss": 3.2852978706359863, "step": 41500}, {"learning_rate": 2.69249539996963e-05, "loss": 3.258441925048828, "step": 42000}, {"learning_rate": 2.6887903004535474e-05, "loss": 3.613506317138672, "step": 42500}, {"learning_rate": 2.6850855647353455e-05, "loss": 3.712674856185913, "step": 43000}, {"learning_rate": 2.6813806471182033e-05, "loss": 3.454590320587158, "step": 43500}, {"learning_rate": 2.6776759114000015e-05, "loss": 3.492033004760742, "step": 44000}, {"learning_rate": 2.6739709937828593e-05, "loss": 3.0745458602905273, "step": 44500}, {"learning_rate": 2.670266076165717e-05, "loss": 3.2017054557800293, "step": 45000}, {"learning_rate": 2.6665613404475152e-05, "loss": 3.4778892993927, "step": 45500}, {"learning_rate": 2.6628562409314327e-05, "loss": 3.2931582927703857, "step": 46000}, {"learning_rate": 2.659151505213231e-05, "loss": 3.732515335083008, "step": 46500}, {"learning_rate": 2.6554465875960886e-05, "loss": 3.439234972000122, "step": 47000}, {"learning_rate": 2.6517416699789464e-05, "loss": 3.46848464012146, "step": 47500}, {"learning_rate": 2.6480369342607446e-05, "loss": 3.492138147354126, "step": 48000}, {"learning_rate": 2.6443320166436024e-05, "loss": 3.349658250808716, "step": 48500}, {"learning_rate": 2.6406272809254006e-05, "loss": 3.4468629360198975, "step": 49000}, {"learning_rate": 2.636922181409318e-05, "loss": 3.447221279144287, "step": 49500}, {"learning_rate": 2.6332172637921758e-05, "loss": 3.5180749893188477, "step": 50000}, {"learning_rate": 2.629512528073974e-05, "loss": 3.261063575744629, "step": 50500}, {"learning_rate": 2.6258076104568318e-05, "loss": 3.5615758895874023, "step": 51000}, {"learning_rate": 2.62210287473863e-05, "loss": 3.355055570602417, "step": 51500}, {"learning_rate": 2.6183977752225474e-05, "loss": 3.4640753269195557, "step": 52000}, {"learning_rate": 2.614692857605405e-05, "loss": 3.126800537109375, "step": 52500}, {"learning_rate": 2.6109881218872033e-05, "loss": 3.2161364555358887, "step": 53000}, {"learning_rate": 2.607283204270061e-05, "loss": 3.485745906829834, "step": 53500}, {"learning_rate": 2.6035784685518593e-05, "loss": 3.273010730743408, "step": 54000}, {"learning_rate": 2.599873550934717e-05, "loss": 3.5718767642974854, "step": 54500}, {"learning_rate": 2.596168633317575e-05, "loss": 3.478156566619873, "step": 55000}, {"learning_rate": 2.5924637157004327e-05, "loss": 3.506965398788452, "step": 55500}, {"learning_rate": 2.5887587980832905e-05, "loss": 3.787013530731201, "step": 56000}, {"learning_rate": 2.5850540623650886e-05, "loss": 3.5410375595092773, "step": 56500}, {"learning_rate": 2.5813491447479464e-05, "loss": 3.572726249694824, "step": 57000}, {"learning_rate": 2.5776442271308042e-05, "loss": 3.415219783782959, "step": 57500}, {"learning_rate": 2.573939309513662e-05, "loss": 3.2402899265289307, "step": 58000}, {"learning_rate": 2.57023457379546e-05, "loss": 3.5043466091156006, "step": 58500}, {"learning_rate": 2.566529656178318e-05, "loss": 3.378573417663574, "step": 59000}, {"learning_rate": 2.5628247385611758e-05, "loss": 3.540923595428467, "step": 59500}, {"learning_rate": 2.5591198209440336e-05, "loss": 3.646481513977051, "step": 60000}, {"learning_rate": 2.5554150852258317e-05, "loss": 3.3717563152313232, "step": 60500}, {"learning_rate": 2.5517101676086895e-05, "loss": 3.3596413135528564, "step": 61000}, {"learning_rate": 2.548005068092607e-05, "loss": 3.4434683322906494, "step": 61500}, {"learning_rate": 2.544300332374405e-05, "loss": 3.2643415927886963, "step": 62000}, {"learning_rate": 2.540595414757263e-05, "loss": 3.3425939083099365, "step": 62500}, {"learning_rate": 2.536890679039061e-05, "loss": 3.4325740337371826, "step": 63000}, {"learning_rate": 2.533185761421919e-05, "loss": 3.5082356929779053, "step": 63500}, {"learning_rate": 2.5294808438047767e-05, "loss": 3.356461524963379, "step": 64000}, {"learning_rate": 2.5257761080865748e-05, "loss": 3.3713481426239014, "step": 64500}, {"learning_rate": 2.5220710085704923e-05, "loss": 3.336869239807129, "step": 65000}, {"learning_rate": 2.5183662728522904e-05, "loss": 3.4301161766052246, "step": 65500}, {"learning_rate": 2.5146613552351482e-05, "loss": 3.3328285217285156, "step": 66000}, {"learning_rate": 2.510956437618006e-05, "loss": 3.570180892944336, "step": 66500}, {"learning_rate": 2.5072517018998042e-05, "loss": 3.2962236404418945, "step": 67000}, {"learning_rate": 2.5035466023837216e-05, "loss": 3.465270519256592, "step": 67500}, {"learning_rate": 2.49984204856446e-05, "loss": 3.415229320526123, "step": 68000}, {"learning_rate": 2.4961369490483776e-05, "loss": 3.0241405963897705, "step": 68500}, {"learning_rate": 2.4924320314312354e-05, "loss": 3.3958702087402344, "step": 69000}, {"learning_rate": 2.4887272957130335e-05, "loss": 3.5552518367767334, "step": 69500}, {"learning_rate": 2.4850223780958913e-05, "loss": 3.4421608448028564, "step": 70000}, {"learning_rate": 2.4813176423776895e-05, "loss": 3.4355344772338867, "step": 70500}, {"learning_rate": 2.477612542861607e-05, "loss": 3.3843164443969727, "step": 71000}, {"learning_rate": 2.4739076252444647e-05, "loss": 3.403310775756836, "step": 71500}, {"learning_rate": 2.470202889526263e-05, "loss": 3.1209683418273926, "step": 72000}, {"learning_rate": 2.4664979719091207e-05, "loss": 3.525054931640625, "step": 72500}, {"learning_rate": 2.462793236190919e-05, "loss": 3.5435166358947754, "step": 73000}, {"learning_rate": 2.4590883185737766e-05, "loss": 3.277332305908203, "step": 73500}, {"learning_rate": 2.4553834009566344e-05, "loss": 3.4398224353790283, "step": 74000}, {"learning_rate": 2.4516784833394922e-05, "loss": 3.2127737998962402, "step": 74500}, {"learning_rate": 2.44797356572235e-05, "loss": 3.2365894317626953, "step": 75000}, {"learning_rate": 2.4442688300041482e-05, "loss": 3.2364091873168945, "step": 75500}, {"learning_rate": 2.440563912387006e-05, "loss": 3.3636703491210938, "step": 76000}, {"learning_rate": 2.4368589947698638e-05, "loss": 3.419970989227295, "step": 76500}, {"learning_rate": 2.4331540771527216e-05, "loss": 3.2975990772247314, "step": 77000}, {"learning_rate": 2.4294491595355794e-05, "loss": 3.183743953704834, "step": 77500}, {"learning_rate": 2.4257444238173775e-05, "loss": 2.95817494392395, "step": 78000}, {"learning_rate": 2.4220395062002353e-05, "loss": 3.450798273086548, "step": 78500}, {"learning_rate": 2.418334588583093e-05, "loss": 3.1244864463806152, "step": 79000}, {"learning_rate": 2.4146298528648913e-05, "loss": 3.4584126472473145, "step": 79500}, {"learning_rate": 2.410924935247749e-05, "loss": 3.2099857330322266, "step": 80000}, {"learning_rate": 2.407220017630607e-05, "loss": 3.5543031692504883, "step": 80500}, {"learning_rate": 2.4035151000134647e-05, "loss": 3.216042995452881, "step": 81000}, {"learning_rate": 2.3998101823963225e-05, "loss": 3.207099437713623, "step": 81500}, {"learning_rate": 2.3961054466781206e-05, "loss": 3.069584846496582, "step": 82000}, {"learning_rate": 2.3924005290609784e-05, "loss": 3.30985951423645, "step": 82500}, {"learning_rate": 2.3886956114438362e-05, "loss": 3.3430368900299072, "step": 83000}, {"learning_rate": 2.3849908757256344e-05, "loss": 3.14524507522583, "step": 83500}, {"learning_rate": 2.381285776209552e-05, "loss": 2.9453845024108887, "step": 84000}, {"learning_rate": 2.37758104049135e-05, "loss": 3.3833494186401367, "step": 84500}, {"learning_rate": 2.3738761228742078e-05, "loss": 2.9838719367980957, "step": 85000}, {"learning_rate": 2.370171387156006e-05, "loss": 3.179948568344116, "step": 85500}, {"learning_rate": 2.3664664695388637e-05, "loss": 3.0406341552734375, "step": 86000}, {"learning_rate": 2.3627613700227812e-05, "loss": 2.930403709411621, "step": 86500}, {"learning_rate": 2.3590566343045793e-05, "loss": 3.0872411727905273, "step": 87000}, {"learning_rate": 2.355351716687437e-05, "loss": 3.261713743209839, "step": 87500}, {"learning_rate": 2.3516469809692353e-05, "loss": 3.1617984771728516, "step": 88000}, {"learning_rate": 2.347942063352093e-05, "loss": 3.2365665435791016, "step": 88500}, {"learning_rate": 2.344237145734951e-05, "loss": 3.2366750240325928, "step": 89000}, {"learning_rate": 2.340532410016749e-05, "loss": 3.1727702617645264, "step": 89500}, {"learning_rate": 2.3368273105006665e-05, "loss": 3.515125036239624, "step": 90000}, {"learning_rate": 2.3331225747824647e-05, "loss": 3.3270559310913086, "step": 90500}, {"learning_rate": 2.3294176571653225e-05, "loss": 3.280552864074707, "step": 91000}, {"learning_rate": 2.3257127395481803e-05, "loss": 3.127174139022827, "step": 91500}, {"learning_rate": 2.3220080038299784e-05, "loss": 3.192018747329712, "step": 92000}, {"learning_rate": 2.318302904313896e-05, "loss": 3.3159713745117188, "step": 92500}, {"learning_rate": 2.3145983504946344e-05, "loss": 3.159844398498535, "step": 93000}, {"learning_rate": 2.3108932509785518e-05, "loss": 3.4407405853271484, "step": 93500}, {"learning_rate": 2.3071883333614096e-05, "loss": 3.107201099395752, "step": 94000}, {"learning_rate": 2.3034835976432078e-05, "loss": 3.4651193618774414, "step": 94500}, {"learning_rate": 2.2997786800260656e-05, "loss": 3.2810723781585693, "step": 95000}, {"learning_rate": 2.2960739443078637e-05, "loss": 3.1676461696624756, "step": 95500}, {"learning_rate": 2.292368844791781e-05, "loss": 3.3465471267700195, "step": 96000}, {"learning_rate": 2.288663927174639e-05, "loss": 3.1134495735168457, "step": 96500}, {"learning_rate": 2.284959191456437e-05, "loss": 3.0729260444641113, "step": 97000}, {"learning_rate": 2.281254273839295e-05, "loss": 3.030113935470581, "step": 97500}, {"learning_rate": 2.277549538121093e-05, "loss": 3.1528491973876953, "step": 98000}, {"learning_rate": 2.2738444386050105e-05, "loss": 3.179975748062134, "step": 98500}, {"learning_rate": 2.2701397028868087e-05, "loss": 3.1880366802215576, "step": 99000}, {"learning_rate": 2.2664347852696665e-05, "loss": 3.4239392280578613, "step": 99500}, {"learning_rate": 2.2627298676525243e-05, "loss": 3.0252604484558105, "step": 100000}, {"learning_rate": 2.259024950035382e-05, "loss": 3.2128806114196777, "step": 100500}, {"learning_rate": 2.2553202143171802e-05, "loss": 3.2896347045898438, "step": 101000}, {"learning_rate": 2.251615296700038e-05, "loss": 3.0912246704101562, "step": 101500}, {"learning_rate": 2.2479103790828958e-05, "loss": 3.2715060710906982, "step": 102000}, {"learning_rate": 2.244205643364694e-05, "loss": 3.353846788406372, "step": 102500}, {"learning_rate": 2.2405005438486114e-05, "loss": 3.2600257396698, "step": 103000}, {"learning_rate": 2.2367958081304096e-05, "loss": 3.030548572540283, "step": 103500}, {"learning_rate": 2.2330908905132674e-05, "loss": 3.2392475605010986, "step": 104000}, {"learning_rate": 2.229385972896125e-05, "loss": 3.3271517753601074, "step": 104500}, {"learning_rate": 2.2256812371779233e-05, "loss": 3.291137933731079, "step": 105000}, {"learning_rate": 2.2219761376618408e-05, "loss": 2.9485490322113037, "step": 105500}, {"learning_rate": 2.218271401943639e-05, "loss": 3.0812811851501465, "step": 106000}, {"learning_rate": 2.2145664843264967e-05, "loss": 3.1856322288513184, "step": 106500}, {"learning_rate": 2.210861748608295e-05, "loss": 3.5202741622924805, "step": 107000}, {"learning_rate": 2.2071568309911527e-05, "loss": 3.232081890106201, "step": 107500}, {"learning_rate": 2.20345173147507e-05, "loss": 3.094364881515503, "step": 108000}, {"learning_rate": 2.1997471776558086e-05, "loss": 3.1236019134521484, "step": 108500}, {"learning_rate": 2.196042078139726e-05, "loss": 3.1682116985321045, "step": 109000}, {"learning_rate": 2.1923373424215242e-05, "loss": 3.2314059734344482, "step": 109500}, {"learning_rate": 2.188632424804382e-05, "loss": 3.4607980251312256, "step": 110000}, {"learning_rate": 2.1849275071872398e-05, "loss": 3.391063690185547, "step": 110500}, {"learning_rate": 2.181222771469038e-05, "loss": 3.151228904724121, "step": 111000}, {"learning_rate": 2.1775176719529554e-05, "loss": 3.1468565464019775, "step": 111500}, {"learning_rate": 2.173813118133694e-05, "loss": 3.026545286178589, "step": 112000}, {"learning_rate": 2.1701080186176114e-05, "loss": 3.2258126735687256, "step": 112500}, {"learning_rate": 2.1664031010004692e-05, "loss": 3.2126312255859375, "step": 113000}, {"learning_rate": 2.1626983652822673e-05, "loss": 3.278456211090088, "step": 113500}, {"learning_rate": 2.1589932657661848e-05, "loss": 3.323538303375244, "step": 114000}, {"learning_rate": 2.1552887119469233e-05, "loss": 3.171614170074463, "step": 114500}, {"learning_rate": 2.1515836124308407e-05, "loss": 3.255587339401245, "step": 115000}, {"learning_rate": 2.1478786948136985e-05, "loss": 3.2740793228149414, "step": 115500}, {"learning_rate": 2.1441739590954967e-05, "loss": 3.3412387371063232, "step": 116000}, {"learning_rate": 2.1404690414783545e-05, "loss": 3.147299289703369, "step": 116500}, {"learning_rate": 2.1367643057601526e-05, "loss": 3.0748326778411865, "step": 117000}, {"learning_rate": 2.13305920624407e-05, "loss": 3.2099809646606445, "step": 117500}, {"learning_rate": 2.1293544705258682e-05, "loss": 3.244529962539673, "step": 118000}, {"learning_rate": 2.125649552908726e-05, "loss": 3.2175166606903076, "step": 118500}, {"learning_rate": 2.121944635291584e-05, "loss": 3.1752829551696777, "step": 119000}, {"learning_rate": 2.118239899573382e-05, "loss": 3.0201520919799805, "step": 119500}, {"learning_rate": 2.1145348000572994e-05, "loss": 3.1492464542388916, "step": 120000}, {"learning_rate": 2.1108300643390976e-05, "loss": 3.1406359672546387, "step": 120500}, {"learning_rate": 2.1071251467219554e-05, "loss": 3.0605740547180176, "step": 121000}, {"learning_rate": 2.1034204110037535e-05, "loss": 3.175384521484375, "step": 121500}, {"learning_rate": 2.0997154933866113e-05, "loss": 3.2910733222961426, "step": 122000}, {"learning_rate": 2.096010575769469e-05, "loss": 3.3637561798095703, "step": 122500}, {"learning_rate": 2.092305658152327e-05, "loss": 3.099407196044922, "step": 123000}, {"learning_rate": 2.0886007405351847e-05, "loss": 3.345130443572998, "step": 123500}, {"learning_rate": 2.084896004816983e-05, "loss": 3.287017345428467, "step": 124000}, {"learning_rate": 2.0811910871998407e-05, "loss": 3.279244899749756, "step": 124500}, {"learning_rate": 2.0774861695826985e-05, "loss": 3.1920812129974365, "step": 125000}, {"learning_rate": 2.0737812519655563e-05, "loss": 3.451967716217041, "step": 125500}, {"learning_rate": 2.0700765162473544e-05, "loss": 3.337390899658203, "step": 126000}, {"learning_rate": 2.0663715986302122e-05, "loss": 3.4011178016662598, "step": 126500}, {"learning_rate": 2.06266668101307e-05, "loss": 3.2147862911224365, "step": 127000}, {"learning_rate": 2.0589619452948682e-05, "loss": 3.0983567237854004, "step": 127500}, {"learning_rate": 2.0552568457787856e-05, "loss": 2.922168731689453, "step": 128000}, {"learning_rate": 2.0515521100605838e-05, "loss": 3.285867691040039, "step": 128500}, {"learning_rate": 2.0478471924434416e-05, "loss": 3.2200870513916016, "step": 129000}, {"learning_rate": 2.0441422748262994e-05, "loss": 3.4233288764953613, "step": 129500}, {"learning_rate": 2.0404375391080976e-05, "loss": 3.106132745742798, "step": 130000}, {"learning_rate": 2.036732439592015e-05, "loss": 2.9383089542388916, "step": 130500}, {"learning_rate": 2.033027703873813e-05, "loss": 3.260836601257324, "step": 131000}, {"learning_rate": 2.029322786256671e-05, "loss": 3.3277111053466797, "step": 131500}, {"learning_rate": 2.025618050538469e-05, "loss": 3.0078885555267334, "step": 132000}, {"learning_rate": 2.021913132921327e-05, "loss": 3.183234930038452, "step": 132500}, {"learning_rate": 2.0182080334052444e-05, "loss": 3.1755008697509766, "step": 133000}, {"learning_rate": 2.014503479585983e-05, "loss": 2.9777884483337402, "step": 133500}, {"learning_rate": 2.0107983800699003e-05, "loss": 3.2220592498779297, "step": 134000}, {"learning_rate": 2.0070936443516985e-05, "loss": 3.3772478103637695, "step": 134500}, {"learning_rate": 2.0033887267345563e-05, "loss": 3.0962066650390625, "step": 135000}, {"learning_rate": 1.999683809117414e-05, "loss": 3.39721941947937, "step": 135500}, {"learning_rate": 1.9959790733992122e-05, "loss": 3.0452966690063477, "step": 136000}, {"learning_rate": 1.9922739738831297e-05, "loss": 3.0959110260009766, "step": 136500}, {"learning_rate": 1.9885692381649278e-05, "loss": 2.9645233154296875, "step": 137000}, {"learning_rate": 1.9848643205477856e-05, "loss": 3.1262292861938477, "step": 137500}, {"learning_rate": 1.9811594029306434e-05, "loss": 3.347893714904785, "step": 138000}, {"learning_rate": 1.9774546672124416e-05, "loss": 3.1925177574157715, "step": 138500}, {"learning_rate": 1.973749567696359e-05, "loss": 3.290494918823242, "step": 139000}, {"learning_rate": 1.970044831978157e-05, "loss": 3.3515443801879883, "step": 139500}, {"learning_rate": 1.966339914361015e-05, "loss": 3.1701760292053223, "step": 140000}, {"learning_rate": 1.9626349967438728e-05, "loss": 3.2979564666748047, "step": 140500}, {"learning_rate": 1.958930261025671e-05, "loss": 3.2812719345092773, "step": 141000}, {"learning_rate": 1.9552253434085287e-05, "loss": 3.0737948417663574, "step": 141500}, {"learning_rate": 1.9515204257913865e-05, "loss": 2.9651243686676025, "step": 142000}, {"learning_rate": 1.9478155081742443e-05, "loss": 3.083522319793701, "step": 142500}, {"learning_rate": 1.9441107724560425e-05, "loss": 2.8407459259033203, "step": 143000}, {"learning_rate": 1.9404058548389003e-05, "loss": 3.084069013595581, "step": 143500}, {"learning_rate": 1.936700937221758e-05, "loss": 3.1204752922058105, "step": 144000}, {"learning_rate": 1.932996019604616e-05, "loss": 3.3654606342315674, "step": 144500}, {"learning_rate": 1.9292911019874737e-05, "loss": 2.8602585792541504, "step": 145000}, {"learning_rate": 1.9255863662692718e-05, "loss": 3.259446859359741, "step": 145500}, {"learning_rate": 1.9218814486521296e-05, "loss": 3.0700788497924805, "step": 146000}, {"learning_rate": 1.9181767129339278e-05, "loss": 3.042152166366577, "step": 146500}, {"learning_rate": 1.9144716134178452e-05, "loss": 3.1057064533233643, "step": 147000}, {"learning_rate": 1.9107668776996434e-05, "loss": 3.1597046852111816, "step": 147500}, {"learning_rate": 1.9070619600825012e-05, "loss": 3.016183376312256, "step": 148000}, {"learning_rate": 1.903357042465359e-05, "loss": 3.1588215827941895, "step": 148500}, {"learning_rate": 1.899652306747157e-05, "loss": 3.1214277744293213, "step": 149000}, {"learning_rate": 1.8959472072310746e-05, "loss": 3.0016119480133057, "step": 149500}, {"learning_rate": 1.8922424715128727e-05, "loss": 3.286888360977173, "step": 150000}, {"learning_rate": 1.8885375538957305e-05, "loss": 3.173213481903076, "step": 150500}, {"learning_rate": 1.8848328181775287e-05, "loss": 3.126720905303955, "step": 151000}, {"learning_rate": 1.8811279005603865e-05, "loss": 3.1332039833068848, "step": 151500}, {"learning_rate": 1.877422801044304e-05, "loss": 2.9429285526275635, "step": 152000}, {"learning_rate": 1.8737182472250424e-05, "loss": 3.166132926940918, "step": 152500}, {"learning_rate": 1.87001314770896e-05, "loss": 3.1142890453338623, "step": 153000}, {"learning_rate": 1.866308411990758e-05, "loss": 2.94692325592041, "step": 153500}, {"learning_rate": 1.862603494373616e-05, "loss": 3.2797868251800537, "step": 154000}, {"learning_rate": 1.8588983948575333e-05, "loss": 3.204414129257202, "step": 154500}, {"learning_rate": 1.8551938410382718e-05, "loss": 3.16141939163208, "step": 155000}, {"learning_rate": 1.8514887415221892e-05, "loss": 2.9320361614227295, "step": 155500}, {"learning_rate": 1.8477840058039874e-05, "loss": 3.1594388484954834, "step": 156000}, {"learning_rate": 1.8440790881868452e-05, "loss": 3.158543348312378, "step": 156500}, {"learning_rate": 1.840374170569703e-05, "loss": 3.1882879734039307, "step": 157000}, {"learning_rate": 1.836669434851501e-05, "loss": 3.0028510093688965, "step": 157500}, {"learning_rate": 1.8329643353354186e-05, "loss": 3.2081379890441895, "step": 158000}, {"learning_rate": 1.829259781516157e-05, "loss": 2.844841718673706, "step": 158500}, {"learning_rate": 1.8255546820000745e-05, "loss": 3.270986795425415, "step": 159000}, {"learning_rate": 1.8218497643829323e-05, "loss": 3.3302557468414307, "step": 159500}, {"learning_rate": 1.8181450286647305e-05, "loss": 3.153517484664917, "step": 160000}, {"learning_rate": 1.8144401110475883e-05, "loss": 2.8370399475097656, "step": 160500}, {"learning_rate": 1.8107353753293864e-05, "loss": 3.294719696044922, "step": 161000}, {"learning_rate": 1.807030275813304e-05, "loss": 2.802330255508423, "step": 161500}, {"learning_rate": 1.803325540095102e-05, "loss": 3.0937390327453613, "step": 162000}, {"learning_rate": 1.79962062247796e-05, "loss": 3.1610093116760254, "step": 162500}, {"learning_rate": 1.7959157048608176e-05, "loss": 2.808134078979492, "step": 163000}, {"learning_rate": 1.7922109691426158e-05, "loss": 3.111701488494873, "step": 163500}, {"learning_rate": 1.7885058696265332e-05, "loss": 2.828094244003296, "step": 164000}, {"learning_rate": 1.7848011339083314e-05, "loss": 3.14345645904541, "step": 164500}, {"learning_rate": 1.7810962162911892e-05, "loss": 3.101114273071289, "step": 165000}, {"learning_rate": 1.777391298674047e-05, "loss": 3.038419723510742, "step": 165500}, {"learning_rate": 1.773686562955845e-05, "loss": 3.0825843811035156, "step": 166000}, {"learning_rate": 1.769981645338703e-05, "loss": 3.156745433807373, "step": 166500}, {"learning_rate": 1.7662767277215607e-05, "loss": 3.2088940143585205, "step": 167000}, {"learning_rate": 1.7625718101044185e-05, "loss": 3.1528801918029785, "step": 167500}, {"learning_rate": 1.7588670743862167e-05, "loss": 2.8410651683807373, "step": 168000}, {"learning_rate": 1.7551621567690745e-05, "loss": 3.2338359355926514, "step": 168500}, {"learning_rate": 1.7514572391519323e-05, "loss": 3.1589131355285645, "step": 169000}, {"learning_rate": 1.74775232153479e-05, "loss": 3.1294631958007812, "step": 169500}, {"learning_rate": 1.744047403917648e-05, "loss": 3.073585033416748, "step": 170000}, {"learning_rate": 1.740342668199446e-05, "loss": 2.728564739227295, "step": 170500}, {"learning_rate": 1.736637750582304e-05, "loss": 3.020862579345703, "step": 171000}, {"learning_rate": 1.732933014864102e-05, "loss": 2.827536106109619, "step": 171500}, {"learning_rate": 1.7292279153480195e-05, "loss": 2.924298048019409, "step": 172000}, {"learning_rate": 1.7255231796298176e-05, "loss": 3.059899091720581, "step": 172500}, {"learning_rate": 1.7218182620126754e-05, "loss": 3.0385708808898926, "step": 173000}, {"learning_rate": 1.7181133443955332e-05, "loss": 3.12054443359375, "step": 173500}, {"learning_rate": 1.7144086086773314e-05, "loss": 2.9774441719055176, "step": 174000}, {"learning_rate": 1.7107035091612488e-05, "loss": 2.9283859729766846, "step": 174500}, {"learning_rate": 1.706998773443047e-05, "loss": 3.3157923221588135, "step": 175000}, {"learning_rate": 1.7032938558259048e-05, "loss": 3.167236804962158, "step": 175500}, {"learning_rate": 1.699589120107703e-05, "loss": 2.9778709411621094, "step": 176000}, {"learning_rate": 1.6958842024905607e-05, "loss": 3.055723190307617, "step": 176500}, {"learning_rate": 1.692179102974478e-05, "loss": 3.176206350326538, "step": 177000}, {"learning_rate": 1.6884745491552167e-05, "loss": 3.3187689781188965, "step": 177500}, {"learning_rate": 1.684769449639134e-05, "loss": 2.781830310821533, "step": 178000}, {"learning_rate": 1.681064532021992e-05, "loss": 3.291243314743042, "step": 178500}, {"learning_rate": 1.67735979630379e-05, "loss": 3.136726140975952, "step": 179000}, {"learning_rate": 1.6736546967877075e-05, "loss": 2.9016270637512207, "step": 179500}, {"learning_rate": 1.669950142968446e-05, "loss": 2.9211506843566895, "step": 180000}, {"learning_rate": 1.6662450434523635e-05, "loss": 3.0946929454803467, "step": 180500}, {"learning_rate": 1.6625401258352213e-05, "loss": 3.098019599914551, "step": 181000}, {"learning_rate": 1.6588353901170194e-05, "loss": 3.1506919860839844, "step": 181500}, {"learning_rate": 1.6551304724998772e-05, "loss": 3.3445491790771484, "step": 182000}, {"learning_rate": 1.6514257367816754e-05, "loss": 3.121548891067505, "step": 182500}, {"learning_rate": 1.6477206372655928e-05, "loss": 2.9296517372131348, "step": 183000}, {"learning_rate": 1.644015901547391e-05, "loss": 2.8066041469573975, "step": 183500}, {"learning_rate": 1.6403109839302488e-05, "loss": 3.1366355419158936, "step": 184000}, {"learning_rate": 1.6366060663131066e-05, "loss": 3.0850439071655273, "step": 184500}, {"learning_rate": 1.6329013305949047e-05, "loss": 3.012803077697754, "step": 185000}, {"learning_rate": 1.6291964129777625e-05, "loss": 2.978450298309326, "step": 185500}, {"learning_rate": 1.6254914953606203e-05, "loss": 2.9665162563323975, "step": 186000}, {"learning_rate": 1.621786577743478e-05, "loss": 2.8545682430267334, "step": 186500}, {"learning_rate": 1.6180818420252763e-05, "loss": 3.0592448711395264, "step": 187000}, {"learning_rate": 1.614376924408134e-05, "loss": 3.2621569633483887, "step": 187500}, {"learning_rate": 1.610672006790992e-05, "loss": 3.029885768890381, "step": 188000}, {"learning_rate": 1.6069670891738497e-05, "loss": 2.7973103523254395, "step": 188500}, {"learning_rate": 1.6032621715567075e-05, "loss": 3.0693390369415283, "step": 189000}, {"learning_rate": 1.5995574358385056e-05, "loss": 3.144068479537964, "step": 189500}, {"learning_rate": 1.5958525182213634e-05, "loss": 3.2915592193603516, "step": 190000}, {"learning_rate": 1.5921476006042212e-05, "loss": 3.144946813583374, "step": 190500}, {"learning_rate": 1.588442682987079e-05, "loss": 3.1473228931427, "step": 191000}, {"learning_rate": 1.5847379472688772e-05, "loss": 3.1550893783569336, "step": 191500}, {"learning_rate": 1.581033029651735e-05, "loss": 3.117825508117676, "step": 192000}, {"learning_rate": 1.5773281120345928e-05, "loss": 3.186227321624756, "step": 192500}, {"learning_rate": 1.573623376316391e-05, "loss": 2.9980432987213135, "step": 193000}, {"learning_rate": 1.5699182768003084e-05, "loss": 2.9007415771484375, "step": 193500}, {"learning_rate": 1.5662135410821065e-05, "loss": 2.9248883724212646, "step": 194000}, {"learning_rate": 1.5625086234649643e-05, "loss": 3.1171798706054688, "step": 194500}, {"learning_rate": 1.558803705847822e-05, "loss": 2.9885401725769043, "step": 195000}, {"learning_rate": 1.5550989701296203e-05, "loss": 2.898653507232666, "step": 195500}, {"learning_rate": 1.5513938706135377e-05, "loss": 3.22615385055542, "step": 196000}, {"learning_rate": 1.547689134895336e-05, "loss": 3.2588343620300293, "step": 196500}, {"learning_rate": 1.5439842172781937e-05, "loss": 2.974750518798828, "step": 197000}, {"learning_rate": 1.540279481559992e-05, "loss": 2.9462814331054688, "step": 197500}, {"learning_rate": 1.5365745639428496e-05, "loss": 3.2097864151000977, "step": 198000}, {"learning_rate": 1.532869464426767e-05, "loss": 3.0194754600524902, "step": 198500}, {"learning_rate": 1.5291649106075056e-05, "loss": 2.962475299835205, "step": 199000}, {"learning_rate": 1.525459811091423e-05, "loss": 3.0340816974639893, "step": 199500}, {"learning_rate": 1.521754984423751e-05, "loss": 3.115537405014038, "step": 200000}, {"learning_rate": 1.518050157756079e-05, "loss": 3.2253212928771973, "step": 200500}, {"learning_rate": 1.514345331088407e-05, "loss": 3.0904855728149414, "step": 201000}, {"learning_rate": 1.510640504420735e-05, "loss": 2.9021413326263428, "step": 201500}, {"learning_rate": 1.5069354049046524e-05, "loss": 3.042985439300537, "step": 202000}, {"learning_rate": 1.5032307601359207e-05, "loss": 2.8822944164276123, "step": 202500}, {"learning_rate": 1.4995257515693083e-05, "loss": 2.8428149223327637, "step": 203000}, {"learning_rate": 1.4958209249016363e-05, "loss": 3.107590675354004, "step": 203500}, {"learning_rate": 1.4921160982339643e-05, "loss": 3.049959659576416, "step": 204000}, {"learning_rate": 1.4884111806168221e-05, "loss": 2.953403949737549, "step": 204500}, {"learning_rate": 1.48470635394915e-05, "loss": 2.857692241668701, "step": 205000}, {"learning_rate": 1.4810013453825377e-05, "loss": 3.1016647815704346, "step": 205500}, {"learning_rate": 1.4772965187148657e-05, "loss": 2.9294021129608154, "step": 206000}, {"learning_rate": 1.4735916920471936e-05, "loss": 3.211087703704834, "step": 206500}, {"learning_rate": 1.4698867744300514e-05, "loss": 3.20558762550354, "step": 207000}, {"learning_rate": 1.4661819477623794e-05, "loss": 3.1136608123779297, "step": 207500}, {"learning_rate": 1.4624771210947074e-05, "loss": 2.9256370067596436, "step": 208000}, {"learning_rate": 1.458772112528095e-05, "loss": 2.9466586112976074, "step": 208500}, {"learning_rate": 1.4550671949109528e-05, "loss": 3.005167007446289, "step": 209000}, {"learning_rate": 1.4513623682432808e-05, "loss": 2.7076120376586914, "step": 209500}, {"learning_rate": 1.4476575415756088e-05, "loss": 2.9420199394226074, "step": 210000}, {"learning_rate": 1.4439527149079368e-05, "loss": 2.9234609603881836, "step": 210500}, {"learning_rate": 1.4402478882402647e-05, "loss": 3.1731948852539062, "step": 211000}, {"learning_rate": 1.4365427887241822e-05, "loss": 3.1505517959594727, "step": 211500}, {"learning_rate": 1.4328379620565102e-05, "loss": 3.0839195251464844, "step": 212000}, {"learning_rate": 1.4291331353888381e-05, "loss": 3.173694133758545, "step": 212500}, {"learning_rate": 1.4254283087211661e-05, "loss": 3.1980838775634766, "step": 213000}, {"learning_rate": 1.421723482053494e-05, "loss": 2.8201732635498047, "step": 213500}, {"learning_rate": 1.4180185644363519e-05, "loss": 3.090770721435547, "step": 214000}, {"learning_rate": 1.4143135558697395e-05, "loss": 3.1012566089630127, "step": 214500}, {"learning_rate": 1.4106087292020675e-05, "loss": 2.812798500061035, "step": 215000}, {"learning_rate": 1.4069039025343955e-05, "loss": 3.1596150398254395, "step": 215500}, {"learning_rate": 1.4031990758667234e-05, "loss": 2.8929476737976074, "step": 216000}, {"learning_rate": 1.3994941582495812e-05, "loss": 2.6602635383605957, "step": 216500}, {"learning_rate": 1.3957893315819092e-05, "loss": 2.8652825355529785, "step": 217000}, {"learning_rate": 1.3920845049142372e-05, "loss": 2.9449541568756104, "step": 217500}, {"learning_rate": 1.3883794963476248e-05, "loss": 2.915107011795044, "step": 218000}, {"learning_rate": 1.3846746696799528e-05, "loss": 3.100043535232544, "step": 218500}, {"learning_rate": 1.3809697520628106e-05, "loss": 2.8705453872680664, "step": 219000}, {"learning_rate": 1.3772649253951386e-05, "loss": 3.043818473815918, "step": 219500}, {"learning_rate": 1.3735600987274665e-05, "loss": 3.109485149383545, "step": 220000}, {"learning_rate": 1.3698552720597945e-05, "loss": 3.1246745586395264, "step": 220500}, {"learning_rate": 1.3661502634931821e-05, "loss": 2.8280739784240723, "step": 221000}, {"learning_rate": 1.36244534587604e-05, "loss": 3.220452308654785, "step": 221500}, {"learning_rate": 1.3587405192083679e-05, "loss": 3.0886945724487305, "step": 222000}, {"learning_rate": 1.3550356925406959e-05, "loss": 2.997331380844116, "step": 222500}, {"learning_rate": 1.3513308658730239e-05, "loss": 3.12556791305542, "step": 223000}, {"learning_rate": 1.3476260392053518e-05, "loss": 2.9916863441467285, "step": 223500}, {"learning_rate": 1.3439209396892693e-05, "loss": 3.1682651042938232, "step": 224000}, {"learning_rate": 1.3402161130215973e-05, "loss": 3.080982208251953, "step": 224500}, {"learning_rate": 1.3365112863539252e-05, "loss": 2.9872114658355713, "step": 225000}, {"learning_rate": 1.3328064596862532e-05, "loss": 3.078572988510132, "step": 225500}, {"learning_rate": 1.3291016330185812e-05, "loss": 3.131373643875122, "step": 226000}, {"learning_rate": 1.325396715401439e-05, "loss": 3.2371582984924316, "step": 226500}, {"learning_rate": 1.3216917068348266e-05, "loss": 3.166215181350708, "step": 227000}, {"learning_rate": 1.3179868801671546e-05, "loss": 3.1015493869781494, "step": 227500}, {"learning_rate": 1.3142820534994826e-05, "loss": 3.135776996612549, "step": 228000}, {"learning_rate": 1.3105771358823404e-05, "loss": 3.1725881099700928, "step": 228500}, {"learning_rate": 1.3068723092146683e-05, "loss": 3.243429183959961, "step": 229000}, {"learning_rate": 1.3031674825469963e-05, "loss": 2.942131996154785, "step": 229500}, {"learning_rate": 1.2994626558793243e-05, "loss": 2.772965431213379, "step": 230000}, {"learning_rate": 1.295757647312712e-05, "loss": 2.868208408355713, "step": 230500}, {"learning_rate": 1.2920527296955697e-05, "loss": 3.0246996879577637, "step": 231000}, {"learning_rate": 1.2883479030278977e-05, "loss": 3.0900251865386963, "step": 231500}, {"learning_rate": 1.2846430763602257e-05, "loss": 2.992485284805298, "step": 232000}, {"learning_rate": 1.2809382496925537e-05, "loss": 2.89326548576355, "step": 232500}, {"learning_rate": 1.2772334230248816e-05, "loss": 2.9153892993927, "step": 233000}, {"learning_rate": 1.273528323508799e-05, "loss": 2.9673829078674316, "step": 233500}, {"learning_rate": 1.269823496841127e-05, "loss": 2.786701202392578, "step": 234000}, {"learning_rate": 1.266118670173455e-05, "loss": 3.098313093185425, "step": 234500}, {"learning_rate": 1.262413843505783e-05, "loss": 3.248845338821411, "step": 235000}, {"learning_rate": 1.258709016838111e-05, "loss": 2.7045295238494873, "step": 235500}, {"learning_rate": 1.2550040992209688e-05, "loss": 2.8771791458129883, "step": 236000}, {"learning_rate": 1.2512990906543564e-05, "loss": 2.9424986839294434, "step": 236500}, {"learning_rate": 1.2475942639866844e-05, "loss": 2.9747812747955322, "step": 237000}, {"learning_rate": 1.2438894373190124e-05, "loss": 3.044034242630005, "step": 237500}, {"learning_rate": 1.2401846106513403e-05, "loss": 3.166414976119995, "step": 238000}, {"learning_rate": 1.2364796930341981e-05, "loss": 2.915086269378662, "step": 238500}, {"learning_rate": 1.2327748663665261e-05, "loss": 3.0272390842437744, "step": 239000}, {"learning_rate": 1.2290698577999137e-05, "loss": 2.817207098007202, "step": 239500}, {"learning_rate": 1.2253650311322417e-05, "loss": 2.874995231628418, "step": 240000}, {"learning_rate": 1.2216602044645697e-05, "loss": 2.995901584625244, "step": 240500}, {"learning_rate": 1.2179552868474275e-05, "loss": 2.803889751434326, "step": 241000}, {"learning_rate": 1.2142504601797555e-05, "loss": 2.9140353202819824, "step": 241500}, {"learning_rate": 1.2105456335120834e-05, "loss": 3.030773162841797, "step": 242000}, {"learning_rate": 1.2068408068444114e-05, "loss": 2.7695300579071045, "step": 242500}, {"learning_rate": 1.203135798277799e-05, "loss": 2.886472463607788, "step": 243000}, {"learning_rate": 1.1994308806606568e-05, "loss": 3.006537675857544, "step": 243500}, {"learning_rate": 1.1957260539929848e-05, "loss": 3.1142640113830566, "step": 244000}, {"learning_rate": 1.1920212273253128e-05, "loss": 2.9694278240203857, "step": 244500}, {"learning_rate": 1.1883164006576408e-05, "loss": 3.058002471923828, "step": 245000}, {"learning_rate": 1.1846115739899687e-05, "loss": 2.9071271419525146, "step": 245500}, {"learning_rate": 1.1809064744738862e-05, "loss": 3.242793083190918, "step": 246000}, {"learning_rate": 1.1772016478062142e-05, "loss": 2.9690797328948975, "step": 246500}, {"learning_rate": 1.1734968211385421e-05, "loss": 3.02500319480896, "step": 247000}, {"learning_rate": 1.1697919944708701e-05, "loss": 3.164280891418457, "step": 247500}, {"learning_rate": 1.166087076853728e-05, "loss": 3.054781913757324, "step": 248000}, {"learning_rate": 1.1623822501860559e-05, "loss": 3.0713183879852295, "step": 248500}, {"learning_rate": 1.1586772416194435e-05, "loss": 2.9013776779174805, "step": 249000}, {"learning_rate": 1.1549724149517715e-05, "loss": 3.1340951919555664, "step": 249500}, {"learning_rate": 1.1512675882840995e-05, "loss": 2.9073238372802734, "step": 250000}, {"learning_rate": 1.1475626706669573e-05, "loss": 2.97891902923584, "step": 250500}, {"learning_rate": 1.1438578439992853e-05, "loss": 2.8549249172210693, "step": 251000}, {"learning_rate": 1.1401530173316132e-05, "loss": 2.9759631156921387, "step": 251500}, {"learning_rate": 1.1364480087650008e-05, "loss": 3.011758327484131, "step": 252000}, {"learning_rate": 1.1327431820973288e-05, "loss": 2.9509031772613525, "step": 252500}, {"learning_rate": 1.1290382644801866e-05, "loss": 2.9001266956329346, "step": 253000}, {"learning_rate": 1.1253334378125146e-05, "loss": 2.917956590652466, "step": 253500}, {"learning_rate": 1.1216286111448426e-05, "loss": 3.0352702140808105, "step": 254000}, {"learning_rate": 1.1179237844771706e-05, "loss": 3.07898211479187, "step": 254500}, {"learning_rate": 1.1142189578094985e-05, "loss": 3.19921612739563, "step": 255000}, {"learning_rate": 1.110513858293416e-05, "loss": 3.019921064376831, "step": 255500}, {"learning_rate": 1.106809031625744e-05, "loss": 3.023876905441284, "step": 256000}, {"learning_rate": 1.103104204958072e-05, "loss": 3.0721943378448486, "step": 256500}, {"learning_rate": 1.0993993782903999e-05, "loss": 2.9380221366882324, "step": 257000}, {"learning_rate": 1.0956945516227279e-05, "loss": 2.8010973930358887, "step": 257500}, {"learning_rate": 1.0919896340055857e-05, "loss": 3.1131911277770996, "step": 258000}, {"learning_rate": 1.0882846254389733e-05, "loss": 2.7600009441375732, "step": 258500}, {"learning_rate": 1.0845797987713013e-05, "loss": 3.144524574279785, "step": 259000}, {"learning_rate": 1.0808749721036293e-05, "loss": 3.1125171184539795, "step": 259500}, {"learning_rate": 1.0771701454359572e-05, "loss": 2.926637649536133, "step": 260000}, {"learning_rate": 1.073465227818815e-05, "loss": 3.052398920059204, "step": 260500}, {"learning_rate": 1.069760401151143e-05, "loss": 3.092421293258667, "step": 261000}, {"learning_rate": 1.0660553925845306e-05, "loss": 3.0150070190429688, "step": 261500}, {"learning_rate": 1.0623505659168586e-05, "loss": 2.682734489440918, "step": 262000}, {"learning_rate": 1.0586457392491866e-05, "loss": 3.0494251251220703, "step": 262500}, {"learning_rate": 1.0549408216320444e-05, "loss": 3.057997941970825, "step": 263000}, {"learning_rate": 1.0512359949643724e-05, "loss": 3.018179178237915, "step": 263500}, {"learning_rate": 1.0475311682967003e-05, "loss": 2.9869699478149414, "step": 264000}, {"learning_rate": 1.043826159730088e-05, "loss": 3.0160861015319824, "step": 264500}, {"learning_rate": 1.040121333062416e-05, "loss": 3.0043540000915527, "step": 265000}, {"learning_rate": 1.0364164154452737e-05, "loss": 2.903616428375244, "step": 265500}, {"learning_rate": 1.0327115887776017e-05, "loss": 2.937068462371826, "step": 266000}, {"learning_rate": 1.0290067621099297e-05, "loss": 2.815018653869629, "step": 266500}, {"learning_rate": 1.0253019354422577e-05, "loss": 3.1014556884765625, "step": 267000}], "eval_log": [{"perplexity": 791.5572205860362, "loss": 6.674002170562744, "step": 500}, {"perplexity": 425.4552251747977, "loss": 6.053159713745117, "step": 1000}, {"perplexity": 311.2549984311311, "loss": 5.740612506866455, "step": 1500}, {"perplexity": 247.68184572271306, "loss": 5.512145042419434, "step": 2000}, {"perplexity": 207.88749255123906, "loss": 5.336997032165527, "step": 2500}, {"perplexity": 179.92964582691437, "loss": 5.19256591796875, "step": 3000}, {"perplexity": 159.83651569547953, "loss": 5.074151515960693, "step": 3500}, {"perplexity": 143.01782933197498, "loss": 4.9629693031311035, "step": 4000}, {"perplexity": 129.402157190518, "loss": 4.862925052642822, "step": 4500}, {"perplexity": 117.13069999342011, "loss": 4.7632904052734375, "step": 5000}, {"perplexity": 105.39752590504294, "loss": 4.657739162445068, "step": 5500}, {"perplexity": 96.1222577250033, "loss": 4.5656208992004395, "step": 6000}, {"perplexity": 88.53200324423844, "loss": 4.483364105224609, "step": 6500}, {"perplexity": 82.99266193370931, "loss": 4.418752193450928, "step": 7000}, {"perplexity": 77.89657346940567, "loss": 4.355381965637207, "step": 7500}, {"perplexity": 74.42784918117577, "loss": 4.309830188751221, "step": 8000}, {"perplexity": 71.23413569545754, "loss": 4.265972137451172, "step": 8500}, {"perplexity": 68.20149386654313, "loss": 4.222466468811035, "step": 9000}, {"perplexity": 65.59373988120987, "loss": 4.183480262756348, "step": 9500}, {"perplexity": 63.16808314131176, "loss": 4.145799160003662, "step": 10000}, {"perplexity": 61.39088735712501, "loss": 4.1172614097595215, "step": 10500}, {"perplexity": 59.337951454745436, "loss": 4.083249092102051, "step": 11000}, {"perplexity": 57.77062379930271, "loss": 4.056480407714844, "step": 11500}, {"perplexity": 55.86889972098841, "loss": 4.023007869720459, "step": 12000}, {"perplexity": 55.080262887846914, "loss": 4.008791446685791, "step": 12500}, {"perplexity": 53.377921394929494, "loss": 3.9773972034454346, "step": 13000}, {"perplexity": 52.10969525597618, "loss": 3.9533510208129883, "step": 13500}, {"perplexity": 51.05323733164036, "loss": 3.9328689575195312, "step": 14000}, {"perplexity": 50.018173900144824, "loss": 3.912386417388916, "step": 14500}, {"perplexity": 49.18442232179093, "loss": 3.8955769538879395, "step": 15000}, {"perplexity": 48.14675535802465, "loss": 3.874253749847412, "step": 15500}, {"perplexity": 47.48436211994075, "loss": 3.860400438308716, "step": 16000}, {"perplexity": 46.701753384236284, "loss": 3.8437817096710205, "step": 16500}, {"perplexity": 46.08185639779742, "loss": 3.8304193019866943, "step": 17000}, {"perplexity": 45.11845491789324, "loss": 3.809291362762451, "step": 17500}, {"perplexity": 44.52170741058671, "loss": 3.7959768772125244, "step": 18000}, {"perplexity": 43.9426158621738, "loss": 3.7828845977783203, "step": 18500}, {"perplexity": 43.324570043691736, "loss": 3.7687199115753174, "step": 19000}, {"perplexity": 42.72750073227192, "loss": 3.754842758178711, "step": 19500}, {"perplexity": 42.204909148100526, "loss": 3.7425365447998047, "step": 20000}, {"perplexity": 41.67519393574982, "loss": 3.7299060821533203, "step": 20500}, {"perplexity": 41.207608964063475, "loss": 3.718622922897339, "step": 21000}, {"perplexity": 40.833251471149595, "loss": 3.7094967365264893, "step": 21500}, {"perplexity": 40.283662033916265, "loss": 3.695945978164673, "step": 22000}, {"perplexity": 39.83168969275355, "loss": 3.6846628189086914, "step": 22500}, {"perplexity": 39.46955907221068, "loss": 3.675529718399048, "step": 23000}, {"perplexity": 39.14338958408711, "loss": 3.667231559753418, "step": 23500}, {"perplexity": 38.58063598181844, "loss": 3.6527504920959473, "step": 24000}, {"perplexity": 38.33958619366831, "loss": 3.6464829444885254, "step": 24500}, {"perplexity": 38.015312694012685, "loss": 3.637989044189453, "step": 25000}, {"perplexity": 37.586639863499634, "loss": 3.6266486644744873, "step": 25500}, {"perplexity": 37.29012840850699, "loss": 3.6187286376953125, "step": 26000}, {"perplexity": 36.895846501078786, "loss": 3.6080989837646484, "step": 26500}, {"perplexity": 36.63736917085982, "loss": 3.6010687351226807, "step": 27000}, {"perplexity": 36.42209167468347, "loss": 3.5951755046844482, "step": 27500}, {"perplexity": 36.170348095815825, "loss": 3.5882396697998047, "step": 28000}, {"perplexity": 35.922828195992665, "loss": 3.5813729763031006, "step": 28500}, {"perplexity": 35.69646078145686, "loss": 3.5750515460968018, "step": 29000}, {"perplexity": 35.341233269097515, "loss": 3.5650503635406494, "step": 29500}, {"perplexity": 35.07558727258192, "loss": 3.5575053691864014, "step": 30000}, {"perplexity": 34.773995380126436, "loss": 3.5488698482513428, "step": 30500}, {"perplexity": 34.61386426062432, "loss": 3.5442543029785156, "step": 31000}, {"perplexity": 34.342409061991624, "loss": 3.5363810062408447, "step": 31500}, {"perplexity": 34.1972918020119, "loss": 3.532146453857422, "step": 32000}, {"perplexity": 33.97022220967611, "loss": 3.525484323501587, "step": 32500}, {"perplexity": 33.745472947145, "loss": 3.518846273422241, "step": 33000}, {"perplexity": 33.46966241890363, "loss": 3.5106394290924072, "step": 33500}, {"perplexity": 33.314846597727644, "loss": 3.5060031414031982, "step": 34000}, {"perplexity": 33.08801912927285, "loss": 3.499171257019043, "step": 34500}, {"perplexity": 32.96064782276171, "loss": 3.495314359664917, "step": 35000}, {"perplexity": 32.7229974737025, "loss": 3.4880781173706055, "step": 35500}, {"perplexity": 32.54914597742669, "loss": 3.4827511310577393, "step": 36000}, {"perplexity": 32.392355045647896, "loss": 3.4779224395751953, "step": 36500}, {"perplexity": 32.171716409132216, "loss": 3.471087694168091, "step": 37000}, {"perplexity": 32.039308180136864, "loss": 3.466963529586792, "step": 37500}, {"perplexity": 31.87032753200191, "loss": 3.4616754055023193, "step": 38000}, {"perplexity": 31.716428412493485, "loss": 3.4568347930908203, "step": 38500}, {"perplexity": 31.588612686197298, "loss": 3.452796697616577, "step": 39000}, {"perplexity": 31.42349994309749, "loss": 3.4475560188293457, "step": 39500}, {"perplexity": 31.249145894998453, "loss": 3.4419920444488525, "step": 40000}, {"perplexity": 31.121350842742647, "loss": 3.437894105911255, "step": 40500}, {"perplexity": 31.022275030848597, "loss": 3.4347054958343506, "step": 41000}, {"perplexity": 30.941915089557472, "loss": 3.4321117401123047, "step": 41500}, {"perplexity": 30.807428090510662, "loss": 3.427755832672119, "step": 42000}, {"perplexity": 30.599007655936873, "loss": 3.4209675788879395, "step": 42500}, {"perplexity": 30.50392191599384, "loss": 3.4178552627563477, "step": 43000}, {"perplexity": 30.37860955143172, "loss": 3.41373872756958, "step": 43500}, {"perplexity": 30.23122186636838, "loss": 3.4088752269744873, "step": 44000}, {"perplexity": 30.12428363066999, "loss": 3.405331611633301, "step": 44500}, {"perplexity": 30.002211776042547, "loss": 3.401271104812622, "step": 45000}, {"perplexity": 29.8838762306462, "loss": 3.3973190784454346, "step": 45500}, {"perplexity": 29.811326376017767, "loss": 3.394888401031494, "step": 46000}, {"perplexity": 29.653043050699893, "loss": 3.3895647525787354, "step": 46500}, {"perplexity": 29.520448672431137, "loss": 3.3850831985473633, "step": 47000}, {"perplexity": 29.47801056569821, "loss": 3.3836445808410645, "step": 47500}, {"perplexity": 29.267340473660614, "loss": 3.376472234725952, "step": 48000}, {"perplexity": 29.19335699118432, "loss": 3.37394118309021, "step": 48500}, {"perplexity": 29.075064438510747, "loss": 3.3698809146881104, "step": 49000}, {"perplexity": 29.003247276431257, "loss": 3.36740779876709, "step": 49500}, {"perplexity": 28.834222625598645, "loss": 3.361562967300415, "step": 50000}, {"perplexity": 28.788727983032963, "loss": 3.3599839210510254, "step": 50500}, {"perplexity": 28.751913700880298, "loss": 3.3587043285369873, "step": 51000}, {"perplexity": 28.57966540407135, "loss": 3.3526954650878906, "step": 51500}, {"perplexity": 28.48260952143674, "loss": 3.3492937088012695, "step": 52000}, {"perplexity": 28.383852994358136, "loss": 3.345820426940918, "step": 52500}, {"perplexity": 28.334474856334907, "loss": 3.3440792560577393, "step": 53000}, {"perplexity": 28.194517399598006, "loss": 3.339127540588379, "step": 53500}, {"perplexity": 28.0979186764346, "loss": 3.335695505142212, "step": 54000}, {"perplexity": 27.983244245719547, "loss": 3.331605911254883, "step": 54500}, {"perplexity": 27.886132714814128, "loss": 3.328129529953003, "step": 55000}, {"perplexity": 27.836565174886086, "loss": 3.326350450515747, "step": 55500}, {"perplexity": 27.77184589462578, "loss": 3.3240227699279785, "step": 56000}, {"perplexity": 27.645099281098382, "loss": 3.319448471069336, "step": 56500}, {"perplexity": 27.67785009440432, "loss": 3.3206324577331543, "step": 57000}, {"perplexity": 27.53734094927352, "loss": 3.3155429363250732, "step": 57500}, {"perplexity": 27.44549986562982, "loss": 3.312202215194702, "step": 58000}, {"perplexity": 27.444773544844747, "loss": 3.312175750732422, "step": 58500}, {"perplexity": 27.338702060888576, "loss": 3.3083033561706543, "step": 59000}, {"perplexity": 27.256704307788922, "loss": 3.3052995204925537, "step": 59500}, {"perplexity": 27.165896341277097, "loss": 3.301962375640869, "step": 60000}, {"perplexity": 27.15563246649171, "loss": 3.301584482192993, "step": 60500}, {"perplexity": 27.019584540258407, "loss": 3.2965619564056396, "step": 61000}, {"perplexity": 26.932789846282013, "loss": 3.293344497680664, "step": 61500}, {"perplexity": 26.884256663365743, "loss": 3.2915408611297607, "step": 62000}, {"perplexity": 26.75319103071662, "loss": 3.286653757095337, "step": 62500}, {"perplexity": 26.81850311164242, "loss": 3.2890920639038086, "step": 63000}, {"perplexity": 26.616246424418584, "loss": 3.281521797180176, "step": 63500}, {"perplexity": 26.63917735393627, "loss": 3.2823829650878906, "step": 64000}, {"perplexity": 26.604135034012657, "loss": 3.281066656112671, "step": 64500}, {"perplexity": 26.52318956483915, "loss": 3.278019428253174, "step": 65000}, {"perplexity": 26.425738643353466, "loss": 3.274338483810425, "step": 65500}, {"perplexity": 26.38191802214025, "loss": 3.272678852081299, "step": 66000}, {"perplexity": 26.334534481886948, "loss": 3.270881175994873, "step": 66500}, {"perplexity": 26.245747523822846, "loss": 3.2675039768218994, "step": 67000}, {"perplexity": 26.240848379116997, "loss": 3.267317295074463, "step": 67500}, {"perplexity": 26.138002835534998, "loss": 3.263390302658081, "step": 68000}, {"perplexity": 26.068038907084382, "loss": 3.2607100009918213, "step": 68500}, {"perplexity": 25.99769199911069, "loss": 3.258007764816284, "step": 69000}, {"perplexity": 25.923511015242973, "loss": 3.255150318145752, "step": 69500}, {"perplexity": 25.92954402828806, "loss": 3.255383014678955, "step": 70000}, {"perplexity": 25.823054329460167, "loss": 3.251267671585083, "step": 70500}, {"perplexity": 25.762082783728218, "loss": 3.248903751373291, "step": 71000}, {"perplexity": 25.73818858795744, "loss": 3.2479758262634277, "step": 71500}, {"perplexity": 25.654476397174726, "loss": 3.244718074798584, "step": 72000}, {"perplexity": 25.59342082722335, "loss": 3.242335319519043, "step": 72500}, {"perplexity": 25.59521486252535, "loss": 3.242405414581299, "step": 73000}, {"perplexity": 25.455456782820534, "loss": 3.2369301319122314, "step": 73500}, {"perplexity": 25.458218352101596, "loss": 3.2370386123657227, "step": 74000}, {"perplexity": 25.389408139064123, "loss": 3.2343320846557617, "step": 74500}, {"perplexity": 25.355471789438717, "loss": 3.232994556427002, "step": 75000}, {"perplexity": 25.254916104269377, "loss": 3.229020833969116, "step": 75500}, {"perplexity": 25.239590675984367, "loss": 3.2284138202667236, "step": 76000}, {"perplexity": 25.21880847240542, "loss": 3.2275900840759277, "step": 76500}, {"perplexity": 25.152906090044034, "loss": 3.224973440170288, "step": 77000}, {"perplexity": 25.123879766862363, "loss": 3.223818778991699, "step": 77500}, {"perplexity": 25.057418930624067, "loss": 3.221169948577881, "step": 78000}, {"perplexity": 25.060824429924015, "loss": 3.2213058471679688, "step": 78500}, {"perplexity": 24.91756145050341, "loss": 3.2155728340148926, "step": 79000}, {"perplexity": 24.89757270712961, "loss": 3.2147703170776367, "step": 79500}, {"perplexity": 24.859913274459082, "loss": 3.213256597518921, "step": 80000}, {"perplexity": 24.81578991488323, "loss": 3.211480140686035, "step": 80500}, {"perplexity": 24.72965285741315, "loss": 3.208003044128418, "step": 81000}, {"perplexity": 24.658355168724604, "loss": 3.205115795135498, "step": 81500}, {"perplexity": 24.662329699767053, "loss": 3.2052769660949707, "step": 82000}, {"perplexity": 24.59583486335474, "loss": 3.2025771141052246, "step": 82500}, {"perplexity": 24.558292230411173, "loss": 3.201049566268921, "step": 83000}, {"perplexity": 24.5520163082955, "loss": 3.200793981552124, "step": 83500}, {"perplexity": 24.44263923523041, "loss": 3.196329116821289, "step": 84000}, {"perplexity": 24.4701258874279, "loss": 3.197453022003174, "step": 84500}, {"perplexity": 24.37882909248159, "loss": 3.1937150955200195, "step": 85000}, {"perplexity": 24.3399405412247, "loss": 3.1921186447143555, "step": 85500}, {"perplexity": 24.27309960031367, "loss": 3.189368724822998, "step": 86000}, {"perplexity": 24.33813004322368, "loss": 3.192044258117676, "step": 86500}, {"perplexity": 24.29344419442418, "loss": 3.190206527709961, "step": 87000}, {"perplexity": 24.237996958168903, "loss": 3.1879215240478516, "step": 87500}, {"perplexity": 24.131808076954627, "loss": 3.183530807495117, "step": 88000}, {"perplexity": 24.10594858144448, "loss": 3.1824586391448975, "step": 88500}, {"perplexity": 24.06460921257347, "loss": 3.1807422637939453, "step": 89000}, {"perplexity": 24.04495500207854, "loss": 3.1799252033233643, "step": 89500}, {"perplexity": 24.01469932664581, "loss": 3.178666114807129, "step": 90000}, {"perplexity": 24.010777644806303, "loss": 3.1785027980804443, "step": 90500}, {"perplexity": 23.877695587078588, "loss": 3.1729447841644287, "step": 91000}, {"perplexity": 23.892421930231887, "loss": 3.1735613346099854, "step": 91500}, {"perplexity": 23.830417491095076, "loss": 3.1709628105163574, "step": 92000}, {"perplexity": 23.83709432305758, "loss": 3.1712429523468018, "step": 92500}, {"perplexity": 23.79365174746762, "loss": 3.1694188117980957, "step": 93000}, {"perplexity": 23.763213708669195, "loss": 3.1681387424468994, "step": 93500}, {"perplexity": 23.689816242413553, "loss": 3.1650452613830566, "step": 94000}, {"perplexity": 23.60060921249962, "loss": 3.1612725257873535, "step": 94500}, {"perplexity": 23.590792447345187, "loss": 3.1608564853668213, "step": 95000}, {"perplexity": 23.569979796722652, "loss": 3.1599738597869873, "step": 95500}, {"perplexity": 23.595703132227857, "loss": 3.161064624786377, "step": 96000}, {"perplexity": 23.580507510171444, "loss": 3.1604204177856445, "step": 96500}, {"perplexity": 23.47914152866274, "loss": 3.1561124324798584, "step": 97000}, {"perplexity": 23.52266602482633, "loss": 3.1579644680023193, "step": 97500}, {"perplexity": 23.461145699518035, "loss": 3.1553456783294678, "step": 98000}, {"perplexity": 23.42412311411751, "loss": 3.153766393661499, "step": 98500}, {"perplexity": 23.401309373454662, "loss": 3.152791976928711, "step": 99000}, {"perplexity": 23.4003385941853, "loss": 3.1527504920959473, "step": 99500}, {"perplexity": 23.392869404424854, "loss": 3.1524312496185303, "step": 100000}, {"perplexity": 23.345337858225594, "loss": 3.150397300720215, "step": 100500}, {"perplexity": 23.314517223931013, "loss": 3.149076223373413, "step": 101000}, {"perplexity": 23.32503649412915, "loss": 3.1495273113250732, "step": 101500}, {"perplexity": 23.246035815673096, "loss": 3.146134614944458, "step": 102000}, {"perplexity": 23.21870071450698, "loss": 3.144958019256592, "step": 102500}, {"perplexity": 23.171865581374384, "loss": 3.1429388523101807, "step": 103000}, {"perplexity": 23.16254744972135, "loss": 3.1425366401672363, "step": 103500}, {"perplexity": 23.12014129054537, "loss": 3.1407041549682617, "step": 104000}, {"perplexity": 23.083821930917324, "loss": 3.139132022857666, "step": 104500}, {"perplexity": 23.077097496643905, "loss": 3.138840675354004, "step": 105000}, {"perplexity": 23.057095364118343, "loss": 3.1379735469818115, "step": 105500}, {"perplexity": 23.02009567452428, "loss": 3.1363675594329834, "step": 106000}, {"perplexity": 22.966125559317337, "loss": 3.1340203285217285, "step": 106500}, {"perplexity": 22.921439313524097, "loss": 3.132072687149048, "step": 107000}, {"perplexity": 22.86787496339118, "loss": 3.129733085632324, "step": 107500}, {"perplexity": 22.908338236018725, "loss": 3.1315009593963623, "step": 108000}, {"perplexity": 22.83282831839598, "loss": 3.128199338912964, "step": 108500}, {"perplexity": 22.82377712255352, "loss": 3.127802848815918, "step": 109000}, {"perplexity": 22.815485611537447, "loss": 3.127439498901367, "step": 109500}, {"perplexity": 22.790134684954747, "loss": 3.1263277530670166, "step": 110000}, {"perplexity": 22.69830774087932, "loss": 3.1222903728485107, "step": 110500}, {"perplexity": 22.712355443096833, "loss": 3.1229090690612793, "step": 111000}, {"perplexity": 22.659096013488785, "loss": 3.120561361312866, "step": 111500}, {"perplexity": 22.647246358230486, "loss": 3.1200382709503174, "step": 112000}, {"perplexity": 22.619250943727756, "loss": 3.1188013553619385, "step": 112500}, {"perplexity": 22.544097283429885, "loss": 3.1154732704162598, "step": 113000}, {"perplexity": 22.580816437916155, "loss": 3.117100715637207, "step": 113500}, {"perplexity": 22.575767104917563, "loss": 3.1168770790100098, "step": 114000}, {"perplexity": 22.534354638158902, "loss": 3.1150410175323486, "step": 114500}, {"perplexity": 22.469364752146834, "loss": 3.1121528148651123, "step": 115000}, {"perplexity": 22.463306673014525, "loss": 3.1118831634521484, "step": 115500}, {"perplexity": 22.418642005898555, "loss": 3.1098928451538086, "step": 116000}, {"perplexity": 22.438566401881275, "loss": 3.110781192779541, "step": 116500}, {"perplexity": 22.36029698296053, "loss": 3.1072869300842285, "step": 117000}, {"perplexity": 22.355344929989727, "loss": 3.107065439224243, "step": 117500}, {"perplexity": 22.377677822851567, "loss": 3.1080639362335205, "step": 118000}, {"perplexity": 22.34744736969142, "loss": 3.1067121028900146, "step": 118500}, {"perplexity": 22.32094540836831, "loss": 3.105525493621826, "step": 119000}, {"perplexity": 22.258523835739954, "loss": 3.102725028991699, "step": 119500}, {"perplexity": 22.27731799929334, "loss": 3.1035690307617188, "step": 120000}, {"perplexity": 22.273090584551074, "loss": 3.103379249572754, "step": 120500}, {"perplexity": 22.220098331216075, "loss": 3.10099720954895, "step": 121000}, {"perplexity": 22.182781028299676, "loss": 3.099316358566284, "step": 121500}, {"perplexity": 22.162100938625834, "loss": 3.098383665084839, "step": 122000}, {"perplexity": 22.14425925659116, "loss": 3.097578287124634, "step": 122500}, {"perplexity": 22.169140153312966, "loss": 3.098701238632202, "step": 123000}, {"perplexity": 22.128278388470147, "loss": 3.0968563556671143, "step": 123500}, {"perplexity": 22.085275035752428, "loss": 3.0949110984802246, "step": 124000}, {"perplexity": 22.063738953613996, "loss": 3.093935489654541, "step": 124500}, {"perplexity": 22.00694718870031, "loss": 3.091358184814453, "step": 125000}, {"perplexity": 22.023622793776408, "loss": 3.092115640640259, "step": 125500}, {"perplexity": 21.982851045136574, "loss": 3.0902626514434814, "step": 126000}, {"perplexity": 21.96710188236187, "loss": 3.089545965194702, "step": 126500}, {"perplexity": 21.940538327731577, "loss": 3.0883359909057617, "step": 127000}, {"perplexity": 21.939889689355137, "loss": 3.088306427001953, "step": 127500}, {"perplexity": 21.945142118807677, "loss": 3.088545799255371, "step": 128000}, {"perplexity": 21.882300212844154, "loss": 3.0856781005859375, "step": 128500}, {"perplexity": 21.865992274236138, "loss": 3.084932565689087, "step": 129000}, {"perplexity": 21.869944281508268, "loss": 3.085113286972046, "step": 129500}, {"perplexity": 21.79680212491413, "loss": 3.08176326751709, "step": 130000}, {"perplexity": 21.743947677852155, "loss": 3.0793354511260986, "step": 130500}, {"perplexity": 21.767839039742043, "loss": 3.0804336071014404, "step": 131000}, {"perplexity": 21.722200636000707, "loss": 3.0783348083496094, "step": 131500}, {"perplexity": 21.68271062867522, "loss": 3.0765151977539062, "step": 132000}, {"perplexity": 21.687105200853217, "loss": 3.0767178535461426, "step": 132500}, {"perplexity": 21.71470794995548, "loss": 3.0779898166656494, "step": 133000}, {"perplexity": 21.677040361741142, "loss": 3.076253652572632, "step": 133500}, {"perplexity": 21.65960467168434, "loss": 3.075448989868164, "step": 134000}, {"perplexity": 21.603935555249556, "loss": 3.072875499725342, "step": 134500}, {"perplexity": 21.64792673493161, "loss": 3.0749096870422363, "step": 135000}, {"perplexity": 21.618218235516295, "loss": 3.0735363960266113, "step": 135500}, {"perplexity": 21.59869269787134, "loss": 3.0726327896118164, "step": 136000}, {"perplexity": 21.581525842387716, "loss": 3.0718376636505127, "step": 136500}, {"perplexity": 21.5629999353123, "loss": 3.070978879928589, "step": 137000}, {"perplexity": 21.52789994568753, "loss": 3.069349765777588, "step": 137500}, {"perplexity": 21.473695644932217, "loss": 3.066828727722168, "step": 138000}, {"perplexity": 21.502298118400308, "loss": 3.068159818649292, "step": 138500}, {"perplexity": 21.484567578735383, "loss": 3.0673348903656006, "step": 139000}, {"perplexity": 21.44477331737592, "loss": 3.065480947494507, "step": 139500}, {"perplexity": 21.40242469373842, "loss": 3.063504219055176, "step": 140000}, {"perplexity": 21.39208394741587, "loss": 3.063020944595337, "step": 140500}, {"perplexity": 21.31960466315698, "loss": 3.059627056121826, "step": 141000}, {"perplexity": 21.328135626488624, "loss": 3.0600271224975586, "step": 141500}, {"perplexity": 21.24259953093119, "loss": 3.0560085773468018, "step": 142000}, {"perplexity": 21.244002479877274, "loss": 3.056074619293213, "step": 142500}, {"perplexity": 21.287697221492756, "loss": 3.05812931060791, "step": 143000}, {"perplexity": 21.28035440965802, "loss": 3.05778431892395, "step": 143500}, {"perplexity": 21.23616842401922, "loss": 3.0557057857513428, "step": 144000}, {"perplexity": 21.247512790528983, "loss": 3.0562398433685303, "step": 144500}, {"perplexity": 21.189315653605394, "loss": 3.053497076034546, "step": 145000}, {"perplexity": 21.20616041668248, "loss": 3.0542917251586914, "step": 145500}, {"perplexity": 21.09542709377569, "loss": 3.0490562915802, "step": 146000}, {"perplexity": 21.11727667160561, "loss": 3.050091505050659, "step": 146500}, {"perplexity": 21.075177824343243, "loss": 3.048095941543579, "step": 147000}, {"perplexity": 21.080363967134673, "loss": 3.048341989517212, "step": 147500}, {"perplexity": 21.02710602274646, "loss": 3.0458123683929443, "step": 148000}, {"perplexity": 21.040169602482347, "loss": 3.046433448791504, "step": 148500}, {"perplexity": 21.05130888567, "loss": 3.0469627380371094, "step": 149000}, {"perplexity": 21.051599991025597, "loss": 3.0469765663146973, "step": 149500}, {"perplexity": 21.021010791012255, "loss": 3.045522451400757, "step": 150000}, {"perplexity": 20.988589428871236, "loss": 3.0439789295196533, "step": 150500}, {"perplexity": 20.93454012832531, "loss": 3.04140043258667, "step": 151000}, {"perplexity": 20.952486128800835, "loss": 3.042257308959961, "step": 151500}, {"perplexity": 20.886943491108507, "loss": 3.0391242504119873, "step": 152000}, {"perplexity": 20.874711618589217, "loss": 3.0385384559631348, "step": 152500}, {"perplexity": 20.898998170027188, "loss": 3.039701223373413, "step": 153000}, {"perplexity": 20.84282896289747, "loss": 3.0370099544525146, "step": 153500}, {"perplexity": 20.809342812878636, "loss": 3.0354020595550537, "step": 154000}, {"perplexity": 20.82757873950701, "loss": 3.036278009414673, "step": 154500}, {"perplexity": 20.778352613313867, "loss": 3.03391170501709, "step": 155000}, {"perplexity": 20.821159102589498, "loss": 3.0359697341918945, "step": 155500}, {"perplexity": 20.80193685948388, "loss": 3.035046100616455, "step": 156000}, {"perplexity": 20.767595429627047, "loss": 3.0333938598632812, "step": 156500}, {"perplexity": 20.776653479550326, "loss": 3.033829927444458, "step": 157000}, {"perplexity": 20.807030959689396, "loss": 3.0352909564971924, "step": 157500}, {"perplexity": 20.646694362179986, "loss": 3.027555227279663, "step": 158000}, {"perplexity": 20.687236902417816, "loss": 3.0295169353485107, "step": 158500}, {"perplexity": 20.65927532308343, "loss": 3.0281643867492676, "step": 159000}, {"perplexity": 20.67065648907341, "loss": 3.028715133666992, "step": 159500}, {"perplexity": 20.63308798147336, "loss": 3.0268959999084473, "step": 160000}, {"perplexity": 20.660447638449963, "loss": 3.0282211303710938, "step": 160500}, {"perplexity": 20.62876435971452, "loss": 3.026686429977417, "step": 161000}, {"perplexity": 20.56751467281683, "loss": 3.0237128734588623, "step": 161500}, {"perplexity": 20.57922798911704, "loss": 3.024282217025757, "step": 162000}, {"perplexity": 20.622622341681335, "loss": 3.026388645172119, "step": 162500}, {"perplexity": 20.588542577091744, "loss": 3.0247347354888916, "step": 163000}, {"perplexity": 20.515811643309963, "loss": 3.021195888519287, "step": 163500}, {"perplexity": 20.467966523207227, "loss": 3.0188610553741455, "step": 164000}, {"perplexity": 20.433976690745084, "loss": 3.0171990394592285, "step": 164500}, {"perplexity": 20.445769944657204, "loss": 3.0177760124206543, "step": 165000}, {"perplexity": 20.456087309363465, "loss": 3.018280506134033, "step": 165500}, {"perplexity": 20.492659280288706, "loss": 3.020066738128662, "step": 166000}, {"perplexity": 20.425593974550885, "loss": 3.0167887210845947, "step": 166500}, {"perplexity": 20.497535919489916, "loss": 3.0203046798706055, "step": 167000}, {"perplexity": 20.487715416063377, "loss": 3.0198254585266113, "step": 167500}, {"perplexity": 20.45540452518088, "loss": 3.018247127532959, "step": 168000}, {"perplexity": 20.348800078595563, "loss": 3.013021945953369, "step": 168500}, {"perplexity": 20.43895144453261, "loss": 3.017442464828491, "step": 169000}, {"perplexity": 20.375272687342257, "loss": 3.01432204246521, "step": 169500}, {"perplexity": 20.363272491097018, "loss": 3.01373291015625, "step": 170000}, {"perplexity": 20.423422140896207, "loss": 3.0166823863983154, "step": 170500}, {"perplexity": 20.42818002386493, "loss": 3.0169153213500977, "step": 171000}, {"perplexity": 20.384295700130448, "loss": 3.0147647857666016, "step": 171500}, {"perplexity": 20.402840150582225, "loss": 3.015674114227295, "step": 172000}, {"perplexity": 20.338226486896435, "loss": 3.0125021934509277, "step": 172500}, {"perplexity": 20.328380529469843, "loss": 3.0120179653167725, "step": 173000}, {"perplexity": 20.308373660482253, "loss": 3.011033296585083, "step": 173500}, {"perplexity": 20.294554757970797, "loss": 3.010352611541748, "step": 174000}, {"perplexity": 20.278429279028778, "loss": 3.0095577239990234, "step": 174500}, {"perplexity": 20.228538072401545, "loss": 3.007094383239746, "step": 175000}, {"perplexity": 20.24863043049036, "loss": 3.008087158203125, "step": 175500}, {"perplexity": 20.232203777572398, "loss": 3.0072755813598633, "step": 176000}, {"perplexity": 20.228287285272312, "loss": 3.007081985473633, "step": 176500}, {"perplexity": 20.21060980812761, "loss": 3.0062077045440674, "step": 177000}, {"perplexity": 20.19302480943959, "loss": 3.0053372383117676, "step": 177500}, {"perplexity": 20.236970187409703, "loss": 3.0075111389160156, "step": 178000}, {"perplexity": 20.216966521033818, "loss": 3.0065221786499023, "step": 178500}, {"perplexity": 20.18718579596763, "loss": 3.0050480365753174, "step": 179000}, {"perplexity": 20.192167865797078, "loss": 3.0052947998046875, "step": 179500}, {"perplexity": 20.168467096256975, "loss": 3.004120349884033, "step": 180000}, {"perplexity": 20.199655315959035, "loss": 3.0056655406951904, "step": 180500}, {"perplexity": 20.171140820189326, "loss": 3.0042529106140137, "step": 181000}, {"perplexity": 20.15598799578201, "loss": 3.0035014152526855, "step": 181500}, {"perplexity": 20.17098692718492, "loss": 3.0042452812194824, "step": 182000}, {"perplexity": 20.10991215020647, "loss": 3.0012128353118896, "step": 182500}, {"perplexity": 20.165981235699277, "loss": 3.0039970874786377, "step": 183000}, {"perplexity": 20.079341216831985, "loss": 2.9996914863586426, "step": 183500}, {"perplexity": 20.083348576723594, "loss": 2.9998910427093506, "step": 184000}, {"perplexity": 20.05891468126891, "loss": 2.998673677444458, "step": 184500}, {"perplexity": 20.040974294343634, "loss": 2.99777889251709, "step": 185000}, {"perplexity": 20.028779413136075, "loss": 2.9971702098846436, "step": 185500}, {"perplexity": 20.024825910349897, "loss": 2.9969727993011475, "step": 186000}, {"perplexity": 20.000973449093866, "loss": 2.9957809448242188, "step": 186500}, {"perplexity": 20.011433731294154, "loss": 2.9963037967681885, "step": 187000}, {"perplexity": 20.001827047365207, "loss": 2.995823621749878, "step": 187500}, {"perplexity": 20.03044126320552, "loss": 2.997253179550171, "step": 188000}, {"perplexity": 20.023493927590582, "loss": 2.996906280517578, "step": 188500}, {"perplexity": 19.963408330122647, "loss": 2.993901014328003, "step": 189000}, {"perplexity": 19.964607797312087, "loss": 2.9939610958099365, "step": 189500}, {"perplexity": 19.899830461234554, "loss": 2.990711212158203, "step": 190000}, {"perplexity": 19.838323262910166, "loss": 2.9876155853271484, "step": 190500}, {"perplexity": 19.891633672321515, "loss": 2.9902992248535156, "step": 191000}, {"perplexity": 19.87654390973293, "loss": 2.9895403385162354, "step": 191500}, {"perplexity": 19.837996907680346, "loss": 2.9875991344451904, "step": 192000}, {"perplexity": 19.8525746117944, "loss": 2.9883337020874023, "step": 192500}, {"perplexity": 19.85451059424064, "loss": 2.988431215286255, "step": 193000}, {"perplexity": 19.865112142578084, "loss": 2.9889650344848633, "step": 193500}, {"perplexity": 19.77692129999357, "loss": 2.98451566696167, "step": 194000}, {"perplexity": 19.76320486902715, "loss": 2.9838218688964844, "step": 194500}, {"perplexity": 19.808916110002265, "loss": 2.9861321449279785, "step": 195000}, {"perplexity": 19.856172186920624, "loss": 2.9885149002075195, "step": 195500}, {"perplexity": 19.731015746998597, "loss": 2.982191801071167, "step": 196000}, {"perplexity": 19.756331380266403, "loss": 2.983474016189575, "step": 196500}, {"perplexity": 19.74415435961083, "loss": 2.9828574657440186, "step": 197000}, {"perplexity": 19.67879263994541, "loss": 2.979541540145874, "step": 197500}, {"perplexity": 19.69283548309039, "loss": 2.980254888534546, "step": 198000}, {"perplexity": 19.742469191914015, "loss": 2.9827721118927, "step": 198500}, {"perplexity": 19.72427101883022, "loss": 2.9818499088287354, "step": 199000}, {"perplexity": 19.665974155180532, "loss": 2.9788899421691895, "step": 199500}, {"perplexity": 19.68993879609698, "loss": 2.9801077842712402, "step": 200000}, {"perplexity": 19.709369305235196, "loss": 2.9810941219329834, "step": 200500}, {"perplexity": 19.655014950727402, "loss": 2.97833251953125, "step": 201000}, {"perplexity": 19.64103185665873, "loss": 2.977620840072632, "step": 201500}, {"perplexity": 19.663761197432734, "loss": 2.9787774085998535, "step": 202000}, {"perplexity": 19.639411679216217, "loss": 2.9775383472442627, "step": 202500}, {"perplexity": 19.65551168580231, "loss": 2.9783577919006348, "step": 203000}, {"perplexity": 19.652433067794373, "loss": 2.978201150894165, "step": 203500}, {"perplexity": 19.61668242213997, "loss": 2.9763803482055664, "step": 204000}, {"perplexity": 19.61969462950162, "loss": 2.976533889770508, "step": 204500}, {"perplexity": 19.578237938703936, "loss": 2.9744186401367188, "step": 205000}, {"perplexity": 19.58540434703266, "loss": 2.9747846126556396, "step": 205500}, {"perplexity": 19.560321729169903, "loss": 2.9735031127929688, "step": 206000}, {"perplexity": 19.57846666300783, "loss": 2.9744303226470947, "step": 206500}, {"perplexity": 19.565970096464177, "loss": 2.9737918376922607, "step": 207000}, {"perplexity": 19.59043407040795, "loss": 2.975041389465332, "step": 207500}, {"perplexity": 19.519181296367684, "loss": 2.971397638320923, "step": 208000}, {"perplexity": 19.550395563192467, "loss": 2.9729955196380615, "step": 208500}, {"perplexity": 19.5318295918743, "loss": 2.972045421600342, "step": 209000}, {"perplexity": 19.49422067122103, "loss": 2.9701180458068848, "step": 209500}, {"perplexity": 19.509410898001708, "loss": 2.9708969593048096, "step": 210000}, {"perplexity": 19.48621418333771, "loss": 2.9697072505950928, "step": 210500}, {"perplexity": 19.517999283349052, "loss": 2.971337080001831, "step": 211000}, {"perplexity": 19.48182432554067, "loss": 2.969481945037842, "step": 211500}, {"perplexity": 19.483779896639447, "loss": 2.9695823192596436, "step": 212000}, {"perplexity": 19.4713251975639, "loss": 2.968942880630493, "step": 212500}, {"perplexity": 19.44907332800576, "loss": 2.967799425125122, "step": 213000}, {"perplexity": 19.441289328797687, "loss": 2.9673991203308105, "step": 213500}, {"perplexity": 19.419882028006505, "loss": 2.9662973880767822, "step": 214000}, {"perplexity": 19.40281386923163, "loss": 2.9654181003570557, "step": 214500}, {"perplexity": 19.38781295523664, "loss": 2.96464467048645, "step": 215000}, {"perplexity": 19.31583721782276, "loss": 2.960925340652466, "step": 215500}, {"perplexity": 19.383228061949247, "loss": 2.9644081592559814, "step": 216000}, {"perplexity": 19.28347560270611, "loss": 2.9592485427856445, "step": 216500}, {"perplexity": 19.307567951381415, "loss": 2.9604971408843994, "step": 217000}, {"perplexity": 19.29837738348537, "loss": 2.9600210189819336, "step": 217500}, {"perplexity": 19.329510450269993, "loss": 2.9616329669952393, "step": 218000}, {"perplexity": 19.329975915830286, "loss": 2.9616570472717285, "step": 218500}, {"perplexity": 19.311048346918135, "loss": 2.9606773853302, "step": 219000}, {"perplexity": 19.314874743619857, "loss": 2.9608755111694336, "step": 219500}, {"perplexity": 19.339043207951033, "loss": 2.9621260166168213, "step": 220000}, {"perplexity": 19.30842878447654, "loss": 2.9605417251586914, "step": 220500}, {"perplexity": 19.326229466493587, "loss": 2.961463212966919, "step": 221000}, {"perplexity": 19.320728617289472, "loss": 2.9611785411834717, "step": 221500}, {"perplexity": 19.30418943734774, "loss": 2.960322141647339, "step": 222000}, {"perplexity": 19.303043454487742, "loss": 2.9602627754211426, "step": 222500}, {"perplexity": 19.26890691064779, "loss": 2.9584927558898926, "step": 223000}, {"perplexity": 19.204056200832177, "loss": 2.9551215171813965, "step": 223500}, {"perplexity": 19.216124612315102, "loss": 2.955749750137329, "step": 224000}, {"perplexity": 19.216976786699583, "loss": 2.955794095993042, "step": 224500}, {"perplexity": 19.15615973303947, "loss": 2.9526243209838867, "step": 225000}, {"perplexity": 19.197509913413402, "loss": 2.9547805786132812, "step": 225500}, {"perplexity": 19.167827874085546, "loss": 2.953233242034912, "step": 226000}, {"perplexity": 19.154323812899477, "loss": 2.952528476715088, "step": 226500}, {"perplexity": 19.133798158582156, "loss": 2.951456308364868, "step": 227000}, {"perplexity": 19.126746833512716, "loss": 2.951087713241577, "step": 227500}, {"perplexity": 19.203374000984542, "loss": 2.9550859928131104, "step": 228000}, {"perplexity": 19.14156400787603, "loss": 2.951862096786499, "step": 228500}, {"perplexity": 19.09504346200155, "loss": 2.9494287967681885, "step": 229000}, {"perplexity": 19.08952649202724, "loss": 2.9491398334503174, "step": 229500}, {"perplexity": 19.127900591775788, "loss": 2.95114803314209, "step": 230000}, {"perplexity": 19.142746043834826, "loss": 2.9519238471984863, "step": 230500}, {"perplexity": 19.09834894520191, "loss": 2.949601888656616, "step": 231000}, {"perplexity": 19.05020287571182, "loss": 2.947077751159668, "step": 231500}, {"perplexity": 19.051701769035514, "loss": 2.9471564292907715, "step": 232000}, {"perplexity": 19.039378009609372, "loss": 2.94650936126709, "step": 232500}, {"perplexity": 19.05280103242646, "loss": 2.947214126586914, "step": 233000}, {"perplexity": 19.04277373979882, "loss": 2.946687698364258, "step": 233500}, {"perplexity": 19.009663994662162, "loss": 2.9449474811553955, "step": 234000}, {"perplexity": 19.05077970857704, "loss": 2.947108030319214, "step": 234500}, {"perplexity": 19.026517861084933, "loss": 2.945833683013916, "step": 235000}, {"perplexity": 18.9959362148414, "loss": 2.9442250728607178, "step": 235500}, {"perplexity": 19.00120868393023, "loss": 2.944502592086792, "step": 236000}, {"perplexity": 18.95728899974572, "loss": 2.9421885013580322, "step": 236500}, {"perplexity": 18.933520882860126, "loss": 2.9409339427948, "step": 237000}, {"perplexity": 18.93739437953345, "loss": 2.941138505935669, "step": 237500}, {"perplexity": 18.930902884051527, "loss": 2.940795660018921, "step": 238000}, {"perplexity": 18.97854382481544, "loss": 2.9433090686798096, "step": 238500}, {"perplexity": 18.97337264024571, "loss": 2.9430365562438965, "step": 239000}, {"perplexity": 18.970477754204385, "loss": 2.9428839683532715, "step": 239500}, {"perplexity": 19.00771975580695, "loss": 2.944845199584961, "step": 240000}, {"perplexity": 18.924869324369862, "loss": 2.940476894378662, "step": 240500}, {"perplexity": 18.948323896765057, "loss": 2.9417154788970947, "step": 241000}, {"perplexity": 18.97111097278145, "loss": 2.9429173469543457, "step": 241500}, {"perplexity": 18.96951439790896, "loss": 2.942833185195923, "step": 242000}, {"perplexity": 18.966470875308538, "loss": 2.9426727294921875, "step": 242500}, {"perplexity": 18.91945113954508, "loss": 2.940190553665161, "step": 243000}, {"perplexity": 18.92680960122196, "loss": 2.940579414367676, "step": 243500}, {"perplexity": 18.94767336885796, "loss": 2.941681146621704, "step": 244000}, {"perplexity": 18.86840432490725, "loss": 2.9374887943267822, "step": 244500}, {"perplexity": 18.851730968307702, "loss": 2.9366047382354736, "step": 245000}, {"perplexity": 18.821627705710785, "loss": 2.935006618499756, "step": 245500}, {"perplexity": 18.849268086807836, "loss": 2.936474084854126, "step": 246000}, {"perplexity": 18.838260963391463, "loss": 2.935889959335327, "step": 246500}, {"perplexity": 18.867657575711828, "loss": 2.9374492168426514, "step": 247000}, {"perplexity": 18.830820197692063, "loss": 2.935494899749756, "step": 247500}, {"perplexity": 18.825958569734414, "loss": 2.935236692428589, "step": 248000}, {"perplexity": 18.83876400594588, "loss": 2.9359166622161865, "step": 248500}, {"perplexity": 18.80535436424121, "loss": 2.9341416358947754, "step": 249000}, {"perplexity": 18.81529701290965, "loss": 2.9346702098846436, "step": 249500}, {"perplexity": 18.802278903287018, "loss": 2.9339780807495117, "step": 250000}, {"perplexity": 18.80097445006648, "loss": 2.933908700942993, "step": 250500}, {"perplexity": 18.78225140473474, "loss": 2.9329123497009277, "step": 251000}, {"perplexity": 18.79223560349217, "loss": 2.933443784713745, "step": 251500}, {"perplexity": 18.816728074653998, "loss": 2.934746265411377, "step": 252000}, {"perplexity": 18.785775951053065, "loss": 2.9330999851226807, "step": 252500}, {"perplexity": 18.803417572170087, "loss": 2.9340386390686035, "step": 253000}, {"perplexity": 18.73384361322595, "loss": 2.9303317070007324, "step": 253500}, {"perplexity": 18.769323283237803, "loss": 2.9322237968444824, "step": 254000}, {"perplexity": 18.739704573091945, "loss": 2.9306445121765137, "step": 254500}, {"perplexity": 18.774613425880837, "loss": 2.9325056076049805, "step": 255000}, {"perplexity": 18.736992757825316, "loss": 2.930499792098999, "step": 255500}, {"perplexity": 18.743882519395594, "loss": 2.9308674335479736, "step": 256000}, {"perplexity": 18.728225603334305, "loss": 2.9300317764282227, "step": 256500}, {"perplexity": 18.707175297081367, "loss": 2.9289071559906006, "step": 257000}, {"perplexity": 18.73362922262669, "loss": 2.9303202629089355, "step": 257500}, {"perplexity": 18.716200332871775, "loss": 2.929389476776123, "step": 258000}, {"perplexity": 18.697004487814016, "loss": 2.92836332321167, "step": 258500}, {"perplexity": 18.68107049875942, "loss": 2.9275107383728027, "step": 259000}, {"perplexity": 18.69281470687588, "loss": 2.9281392097473145, "step": 259500}, {"perplexity": 18.675775545216613, "loss": 2.927227258682251, "step": 260000}, {"perplexity": 18.67261442994945, "loss": 2.927057981491089, "step": 260500}, {"perplexity": 18.720855079951907, "loss": 2.929638147354126, "step": 261000}, {"perplexity": 18.649558938764812, "loss": 2.9258224964141846, "step": 261500}, {"perplexity": 18.65004360280881, "loss": 2.9258484840393066, "step": 262000}, {"perplexity": 18.62191393953232, "loss": 2.9243390560150146, "step": 262500}, {"perplexity": 18.61975187745569, "loss": 2.924222946166992, "step": 263000}, {"perplexity": 18.616826612003536, "loss": 2.9240658283233643, "step": 263500}, {"perplexity": 18.63590015167409, "loss": 2.9250898361206055, "step": 264000}, {"perplexity": 18.664762932693066, "loss": 2.9266374111175537, "step": 264500}, {"perplexity": 18.648780834762405, "loss": 2.925780773162842, "step": 265000}, {"perplexity": 18.613231695286103, "loss": 2.923872709274292, "step": 265500}, {"perplexity": 18.63621117440787, "loss": 2.9251065254211426, "step": 266000}, {"perplexity": 18.590462275020485, "loss": 2.9226486682891846, "step": 266500}, {"perplexity": 18.583575738611543, "loss": 2.9222781658172607, "step": 267000}]}
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3519958058426153098a15ea5dd3a21c316b10c47d2733fa01ff7b987a5cc390
3
+ size 3096134690
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.36.0"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ }
13
+ },
14
+ "bos_token": "<|endoftext|>",
15
+ "clean_up_tokenization_spaces": true,
16
+ "eos_token": "<|endoftext|>",
17
+ "errors": "replace",
18
+ "model_max_length": 1024,
19
+ "pad_token": null,
20
+ "tokenizer_class": "GPT2Tokenizer",
21
+ "unk_token": "<|endoftext|>"
22
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff