Upload 13 files

Browse files

Files changed (13) hide show

config.json +39 -0
generation_config.json +6 -0
merges.txt +0 -0
model.safetensors +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +5 -0
tokenizer.json +0 -0
tokenizer_config.json +19 -0
trainer_state.json +1421 -0
training_args.bin +3 -0
vocab.json +0 -0

config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "_name_or_path": "openai-community/gpt2",
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.40.1"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4cc3830150f1b96e749b05aa4896a89c073f25562750ec50d33590ad9858c03
+size 497774208

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dcad6df9b1bc1c01e062177b716a8d4514fb35f9dbfa9c8d9343ff984a5c6635
+size 995642298

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:40c45e9f4c00aea5599fd7092cf44a64857d6b6f915890545cb7f11f53e7b0f0
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:726fc45979fcbc0b528e0eee04eba68f1b23676fca33f522bab3252706de38d2
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1024,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1421 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 11.889192723814054,
+  "eval_steps": 500,
+  "global_step": 100000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.059445963619070265,
+      "grad_norm": 2.0723960399627686,
+      "learning_rate": 4.985138509095233e-05,
+      "loss": 3.6788,
+      "step": 500
+    },
+    {
+      "epoch": 0.11889192723814053,
+      "grad_norm": 1.9278995990753174,
+      "learning_rate": 4.970277018190465e-05,
+      "loss": 3.4742,
+      "step": 1000
+    },
+    {
+      "epoch": 0.1783378908572108,
+      "grad_norm": 1.4848977327346802,
+      "learning_rate": 4.955415527285698e-05,
+      "loss": 3.3942,
+      "step": 1500
+    },
+    {
+      "epoch": 0.23778385447628106,
+      "grad_norm": 1.3492341041564941,
+      "learning_rate": 4.94055403638093e-05,
+      "loss": 3.3358,
+      "step": 2000
+    },
+    {
+      "epoch": 0.2972298180953513,
+      "grad_norm": 1.212128758430481,
+      "learning_rate": 4.925692545476163e-05,
+      "loss": 3.2851,
+      "step": 2500
+    },
+    {
+      "epoch": 0.3566757817144216,
+      "grad_norm": 1.1597293615341187,
+      "learning_rate": 4.9108310545713945e-05,
+      "loss": 3.2331,
+      "step": 3000
+    },
+    {
+      "epoch": 0.41612174533349183,
+      "grad_norm": 0.9653922319412231,
+      "learning_rate": 4.8959695636666275e-05,
+      "loss": 3.2339,
+      "step": 3500
+    },
+    {
+      "epoch": 0.4755677089525621,
+      "grad_norm": 1.0085793733596802,
+      "learning_rate": 4.88110807276186e-05,
+      "loss": 3.1856,
+      "step": 4000
+    },
+    {
+      "epoch": 0.5350136725716323,
+      "grad_norm": 1.0556505918502808,
+      "learning_rate": 4.866246581857092e-05,
+      "loss": 3.1748,
+      "step": 4500
+    },
+    {
+      "epoch": 0.5944596361907026,
+      "grad_norm": 0.9526228904724121,
+      "learning_rate": 4.851385090952324e-05,
+      "loss": 3.1529,
+      "step": 5000
+    },
+    {
+      "epoch": 0.6539055998097729,
+      "grad_norm": 0.984980046749115,
+      "learning_rate": 4.836523600047557e-05,
+      "loss": 3.1378,
+      "step": 5500
+    },
+    {
+      "epoch": 0.7133515634288432,
+      "grad_norm": 1.0135027170181274,
+      "learning_rate": 4.8216621091427895e-05,
+      "loss": 3.0848,
+      "step": 6000
+    },
+    {
+      "epoch": 0.7727975270479135,
+      "grad_norm": 0.9454924464225769,
+      "learning_rate": 4.806800618238022e-05,
+      "loss": 3.0916,
+      "step": 6500
+    },
+    {
+      "epoch": 0.8322434906669837,
+      "grad_norm": 0.9793129563331604,
+      "learning_rate": 4.791939127333254e-05,
+      "loss": 3.0642,
+      "step": 7000
+    },
+    {
+      "epoch": 0.891689454286054,
+      "grad_norm": 0.9016062617301941,
+      "learning_rate": 4.777077636428487e-05,
+      "loss": 3.0657,
+      "step": 7500
+    },
+    {
+      "epoch": 0.9511354179051242,
+      "grad_norm": 0.8690605163574219,
+      "learning_rate": 4.762216145523719e-05,
+      "loss": 3.0281,
+      "step": 8000
+    },
+    {
+      "epoch": 1.0105813815241944,
+      "grad_norm": 0.891808271408081,
+      "learning_rate": 4.7473546546189516e-05,
+      "loss": 3.0155,
+      "step": 8500
+    },
+    {
+      "epoch": 1.0700273451432647,
+      "grad_norm": 0.9521974325180054,
+      "learning_rate": 4.732493163714184e-05,
+      "loss": 2.9713,
+      "step": 9000
+    },
+    {
+      "epoch": 1.129473308762335,
+      "grad_norm": 0.9132643938064575,
+      "learning_rate": 4.717631672809417e-05,
+      "loss": 2.9663,
+      "step": 9500
+    },
+    {
+      "epoch": 1.1889192723814053,
+      "grad_norm": 0.909182608127594,
+      "learning_rate": 4.702770181904649e-05,
+      "loss": 2.9616,
+      "step": 10000
+    },
+    {
+      "epoch": 1.2483652360004756,
+      "grad_norm": 0.912726104259491,
+      "learning_rate": 4.687908690999881e-05,
+      "loss": 2.9653,
+      "step": 10500
+    },
+    {
+      "epoch": 1.3078111996195458,
+      "grad_norm": 0.8568936586380005,
+      "learning_rate": 4.6730472000951136e-05,
+      "loss": 2.9486,
+      "step": 11000
+    },
+    {
+      "epoch": 1.3672571632386161,
+      "grad_norm": 0.9120291471481323,
+      "learning_rate": 4.6581857091903465e-05,
+      "loss": 2.932,
+      "step": 11500
+    },
+    {
+      "epoch": 1.4267031268576864,
+      "grad_norm": 0.981961190700531,
+      "learning_rate": 4.643324218285579e-05,
+      "loss": 2.9345,
+      "step": 12000
+    },
+    {
+      "epoch": 1.4861490904767567,
+      "grad_norm": 0.9763424396514893,
+      "learning_rate": 4.628462727380811e-05,
+      "loss": 2.9193,
+      "step": 12500
+    },
+    {
+      "epoch": 1.545595054095827,
+      "grad_norm": 0.8868328332901001,
+      "learning_rate": 4.6136012364760434e-05,
+      "loss": 2.9164,
+      "step": 13000
+    },
+    {
+      "epoch": 1.605041017714897,
+      "grad_norm": 0.9175488352775574,
+      "learning_rate": 4.598739745571276e-05,
+      "loss": 2.8932,
+      "step": 13500
+    },
+    {
+      "epoch": 1.6644869813339676,
+      "grad_norm": 0.890186607837677,
+      "learning_rate": 4.583878254666508e-05,
+      "loss": 2.8933,
+      "step": 14000
+    },
+    {
+      "epoch": 1.7239329449530376,
+      "grad_norm": 0.9198343753814697,
+      "learning_rate": 4.569016763761741e-05,
+      "loss": 2.881,
+      "step": 14500
+    },
+    {
+      "epoch": 1.783378908572108,
+      "grad_norm": 0.9706104397773743,
+      "learning_rate": 4.554155272856973e-05,
+      "loss": 2.8705,
+      "step": 15000
+    },
+    {
+      "epoch": 1.8428248721911782,
+      "grad_norm": 0.9355807304382324,
+      "learning_rate": 4.539293781952206e-05,
+      "loss": 2.8601,
+      "step": 15500
+    },
+    {
+      "epoch": 1.9022708358102485,
+      "grad_norm": 0.8972137570381165,
+      "learning_rate": 4.524432291047438e-05,
+      "loss": 2.8632,
+      "step": 16000
+    },
+    {
+      "epoch": 1.9617167994293188,
+      "grad_norm": 0.8553013801574707,
+      "learning_rate": 4.5095708001426706e-05,
+      "loss": 2.8696,
+      "step": 16500
+    },
+    {
+      "epoch": 2.021162763048389,
+      "grad_norm": 0.8952363133430481,
+      "learning_rate": 4.494709309237903e-05,
+      "loss": 2.8541,
+      "step": 17000
+    },
+    {
+      "epoch": 2.0806087266674593,
+      "grad_norm": 0.8947279453277588,
+      "learning_rate": 4.479847818333135e-05,
+      "loss": 2.8203,
+      "step": 17500
+    },
+    {
+      "epoch": 2.1400546902865294,
+      "grad_norm": 0.8680304884910583,
+      "learning_rate": 4.4649863274283674e-05,
+      "loss": 2.8088,
+      "step": 18000
+    },
+    {
+      "epoch": 2.1995006539056,
+      "grad_norm": 0.8425644040107727,
+      "learning_rate": 4.4501248365236004e-05,
+      "loss": 2.8064,
+      "step": 18500
+    },
+    {
+      "epoch": 2.25894661752467,
+      "grad_norm": 0.9474213719367981,
+      "learning_rate": 4.4352633456188327e-05,
+      "loss": 2.7851,
+      "step": 19000
+    },
+    {
+      "epoch": 2.3183925811437405,
+      "grad_norm": 0.9292487502098083,
+      "learning_rate": 4.420401854714065e-05,
+      "loss": 2.8062,
+      "step": 19500
+    },
+    {
+      "epoch": 2.3778385447628105,
+      "grad_norm": 0.8527488708496094,
+      "learning_rate": 4.405540363809297e-05,
+      "loss": 2.7851,
+      "step": 20000
+    },
+    {
+      "epoch": 2.437284508381881,
+      "grad_norm": 0.9439261555671692,
+      "learning_rate": 4.39067887290453e-05,
+      "loss": 2.7873,
+      "step": 20500
+    },
+    {
+      "epoch": 2.496730472000951,
+      "grad_norm": 0.9343836903572083,
+      "learning_rate": 4.3758173819997624e-05,
+      "loss": 2.7611,
+      "step": 21000
+    },
+    {
+      "epoch": 2.5561764356200216,
+      "grad_norm": 0.9050599932670593,
+      "learning_rate": 4.360955891094995e-05,
+      "loss": 2.767,
+      "step": 21500
+    },
+    {
+      "epoch": 2.6156223992390917,
+      "grad_norm": 0.9053699374198914,
+      "learning_rate": 4.346094400190227e-05,
+      "loss": 2.7873,
+      "step": 22000
+    },
+    {
+      "epoch": 2.6750683628581617,
+      "grad_norm": 0.9282116293907166,
+      "learning_rate": 4.33123290928546e-05,
+      "loss": 2.7607,
+      "step": 22500
+    },
+    {
+      "epoch": 2.7345143264772322,
+      "grad_norm": 0.9617480635643005,
+      "learning_rate": 4.316371418380692e-05,
+      "loss": 2.7678,
+      "step": 23000
+    },
+    {
+      "epoch": 2.7939602900963023,
+      "grad_norm": 0.9725137948989868,
+      "learning_rate": 4.3015099274759244e-05,
+      "loss": 2.7665,
+      "step": 23500
+    },
+    {
+      "epoch": 2.853406253715373,
+      "grad_norm": 0.9514666199684143,
+      "learning_rate": 4.286648436571157e-05,
+      "loss": 2.7534,
+      "step": 24000
+    },
+    {
+      "epoch": 2.912852217334443,
+      "grad_norm": 0.9485461115837097,
+      "learning_rate": 4.27178694566639e-05,
+      "loss": 2.7306,
+      "step": 24500
+    },
+    {
+      "epoch": 2.9722981809535134,
+      "grad_norm": 1.014106035232544,
+      "learning_rate": 4.256925454761622e-05,
+      "loss": 2.736,
+      "step": 25000
+    },
+    {
+      "epoch": 3.0317441445725835,
+      "grad_norm": 0.9117903113365173,
+      "learning_rate": 4.242063963856854e-05,
+      "loss": 2.7278,
+      "step": 25500
+    },
+    {
+      "epoch": 3.091190108191654,
+      "grad_norm": 0.8904880881309509,
+      "learning_rate": 4.2272024729520865e-05,
+      "loss": 2.7156,
+      "step": 26000
+    },
+    {
+      "epoch": 3.150636071810724,
+      "grad_norm": 0.8653568625450134,
+      "learning_rate": 4.2123409820473194e-05,
+      "loss": 2.7137,
+      "step": 26500
+    },
+    {
+      "epoch": 3.210082035429794,
+      "grad_norm": 0.9386480450630188,
+      "learning_rate": 4.197479491142551e-05,
+      "loss": 2.7021,
+      "step": 27000
+    },
+    {
+      "epoch": 3.2695279990488646,
+      "grad_norm": 1.0122427940368652,
+      "learning_rate": 4.182618000237784e-05,
+      "loss": 2.699,
+      "step": 27500
+    },
+    {
+      "epoch": 3.3289739626679347,
+      "grad_norm": 0.9319558143615723,
+      "learning_rate": 4.167756509333017e-05,
+      "loss": 2.689,
+      "step": 28000
+    },
+    {
+      "epoch": 3.388419926287005,
+      "grad_norm": 0.9281746745109558,
+      "learning_rate": 4.152895018428249e-05,
+      "loss": 2.7027,
+      "step": 28500
+    },
+    {
+      "epoch": 3.4478658899060752,
+      "grad_norm": 0.9750462770462036,
+      "learning_rate": 4.1380335275234815e-05,
+      "loss": 2.6947,
+      "step": 29000
+    },
+    {
+      "epoch": 3.5073118535251457,
+      "grad_norm": 0.8887720704078674,
+      "learning_rate": 4.123172036618714e-05,
+      "loss": 2.6864,
+      "step": 29500
+    },
+    {
+      "epoch": 3.566757817144216,
+      "grad_norm": 0.9884176254272461,
+      "learning_rate": 4.108310545713947e-05,
+      "loss": 2.6893,
+      "step": 30000
+    },
+    {
+      "epoch": 3.6262037807632863,
+      "grad_norm": 0.9995080828666687,
+      "learning_rate": 4.093449054809178e-05,
+      "loss": 2.6734,
+      "step": 30500
+    },
+    {
+      "epoch": 3.6856497443823564,
+      "grad_norm": 1.0068608522415161,
+      "learning_rate": 4.078587563904411e-05,
+      "loss": 2.6766,
+      "step": 31000
+    },
+    {
+      "epoch": 3.7450957080014264,
+      "grad_norm": 1.0225422382354736,
+      "learning_rate": 4.0637260729996435e-05,
+      "loss": 2.6757,
+      "step": 31500
+    },
+    {
+      "epoch": 3.804541671620497,
+      "grad_norm": 0.9354658126831055,
+      "learning_rate": 4.0488645820948765e-05,
+      "loss": 2.6593,
+      "step": 32000
+    },
+    {
+      "epoch": 3.8639876352395675,
+      "grad_norm": 0.9209592938423157,
+      "learning_rate": 4.034003091190108e-05,
+      "loss": 2.6547,
+      "step": 32500
+    },
+    {
+      "epoch": 3.9234335988586375,
+      "grad_norm": 0.8945015668869019,
+      "learning_rate": 4.019141600285341e-05,
+      "loss": 2.6719,
+      "step": 33000
+    },
+    {
+      "epoch": 3.9828795624777076,
+      "grad_norm": 0.9823748469352722,
+      "learning_rate": 4.004280109380573e-05,
+      "loss": 2.6781,
+      "step": 33500
+    },
+    {
+      "epoch": 4.042325526096778,
+      "grad_norm": 1.0186822414398193,
+      "learning_rate": 3.989418618475806e-05,
+      "loss": 2.6469,
+      "step": 34000
+    },
+    {
+      "epoch": 4.101771489715849,
+      "grad_norm": 0.9255732297897339,
+      "learning_rate": 3.974557127571038e-05,
+      "loss": 2.6296,
+      "step": 34500
+    },
+    {
+      "epoch": 4.161217453334919,
+      "grad_norm": 1.0235294103622437,
+      "learning_rate": 3.959695636666271e-05,
+      "loss": 2.6358,
+      "step": 35000
+    },
+    {
+      "epoch": 4.220663416953989,
+      "grad_norm": 0.911547064781189,
+      "learning_rate": 3.944834145761503e-05,
+      "loss": 2.6354,
+      "step": 35500
+    },
+    {
+      "epoch": 4.280109380573059,
+      "grad_norm": 1.0124516487121582,
+      "learning_rate": 3.929972654856735e-05,
+      "loss": 2.6416,
+      "step": 36000
+    },
+    {
+      "epoch": 4.33955534419213,
+      "grad_norm": 1.0222316980361938,
+      "learning_rate": 3.9151111639519676e-05,
+      "loss": 2.6188,
+      "step": 36500
+    },
+    {
+      "epoch": 4.3990013078112,
+      "grad_norm": 0.9710135459899902,
+      "learning_rate": 3.9002496730472005e-05,
+      "loss": 2.6228,
+      "step": 37000
+    },
+    {
+      "epoch": 4.45844727143027,
+      "grad_norm": 1.0287182331085205,
+      "learning_rate": 3.885388182142433e-05,
+      "loss": 2.6067,
+      "step": 37500
+    },
+    {
+      "epoch": 4.51789323504934,
+      "grad_norm": 0.9699456095695496,
+      "learning_rate": 3.870526691237665e-05,
+      "loss": 2.6385,
+      "step": 38000
+    },
+    {
+      "epoch": 4.57733919866841,
+      "grad_norm": 0.9066009521484375,
+      "learning_rate": 3.855665200332897e-05,
+      "loss": 2.6284,
+      "step": 38500
+    },
+    {
+      "epoch": 4.636785162287481,
+      "grad_norm": 0.8537769317626953,
+      "learning_rate": 3.84080370942813e-05,
+      "loss": 2.6135,
+      "step": 39000
+    },
+    {
+      "epoch": 4.696231125906551,
+      "grad_norm": 1.0666980743408203,
+      "learning_rate": 3.8259422185233626e-05,
+      "loss": 2.6312,
+      "step": 39500
+    },
+    {
+      "epoch": 4.755677089525621,
+      "grad_norm": 1.0641474723815918,
+      "learning_rate": 3.811080727618595e-05,
+      "loss": 2.6127,
+      "step": 40000
+    },
+    {
+      "epoch": 4.815123053144691,
+      "grad_norm": 1.076323390007019,
+      "learning_rate": 3.796219236713827e-05,
+      "loss": 2.6184,
+      "step": 40500
+    },
+    {
+      "epoch": 4.874569016763762,
+      "grad_norm": 0.8963558077812195,
+      "learning_rate": 3.78135774580906e-05,
+      "loss": 2.6165,
+      "step": 41000
+    },
+    {
+      "epoch": 4.934014980382832,
+      "grad_norm": 0.968908429145813,
+      "learning_rate": 3.766496254904292e-05,
+      "loss": 2.6009,
+      "step": 41500
+    },
+    {
+      "epoch": 4.993460944001902,
+      "grad_norm": 0.9362033605575562,
+      "learning_rate": 3.7516347639995246e-05,
+      "loss": 2.5956,
+      "step": 42000
+    },
+    {
+      "epoch": 5.052906907620972,
+      "grad_norm": 1.1101199388504028,
+      "learning_rate": 3.736773273094757e-05,
+      "loss": 2.5755,
+      "step": 42500
+    },
+    {
+      "epoch": 5.112352871240043,
+      "grad_norm": 1.2178868055343628,
+      "learning_rate": 3.72191178218999e-05,
+      "loss": 2.5724,
+      "step": 43000
+    },
+    {
+      "epoch": 5.171798834859113,
+      "grad_norm": 1.0143418312072754,
+      "learning_rate": 3.707050291285222e-05,
+      "loss": 2.5834,
+      "step": 43500
+    },
+    {
+      "epoch": 5.231244798478183,
+      "grad_norm": 0.9720271825790405,
+      "learning_rate": 3.6921888003804544e-05,
+      "loss": 2.586,
+      "step": 44000
+    },
+    {
+      "epoch": 5.290690762097253,
+      "grad_norm": 0.8847070932388306,
+      "learning_rate": 3.6773273094756866e-05,
+      "loss": 2.5953,
+      "step": 44500
+    },
+    {
+      "epoch": 5.3501367257163235,
+      "grad_norm": 0.9654759764671326,
+      "learning_rate": 3.6624658185709196e-05,
+      "loss": 2.5777,
+      "step": 45000
+    },
+    {
+      "epoch": 5.409582689335394,
+      "grad_norm": 0.9272730350494385,
+      "learning_rate": 3.647604327666151e-05,
+      "loss": 2.5774,
+      "step": 45500
+    },
+    {
+      "epoch": 5.4690286529544645,
+      "grad_norm": 0.9674676656723022,
+      "learning_rate": 3.632742836761384e-05,
+      "loss": 2.5779,
+      "step": 46000
+    },
+    {
+      "epoch": 5.528474616573535,
+      "grad_norm": 1.0238367319107056,
+      "learning_rate": 3.6178813458566164e-05,
+      "loss": 2.5683,
+      "step": 46500
+    },
+    {
+      "epoch": 5.587920580192605,
+      "grad_norm": 1.1663753986358643,
+      "learning_rate": 3.603019854951849e-05,
+      "loss": 2.5802,
+      "step": 47000
+    },
+    {
+      "epoch": 5.647366543811675,
+      "grad_norm": 0.8961432576179504,
+      "learning_rate": 3.588158364047081e-05,
+      "loss": 2.5726,
+      "step": 47500
+    },
+    {
+      "epoch": 5.706812507430746,
+      "grad_norm": 1.1115467548370361,
+      "learning_rate": 3.573296873142314e-05,
+      "loss": 2.5719,
+      "step": 48000
+    },
+    {
+      "epoch": 5.766258471049816,
+      "grad_norm": 1.00434148311615,
+      "learning_rate": 3.558435382237546e-05,
+      "loss": 2.556,
+      "step": 48500
+    },
+    {
+      "epoch": 5.825704434668886,
+      "grad_norm": 1.1120518445968628,
+      "learning_rate": 3.5435738913327784e-05,
+      "loss": 2.5627,
+      "step": 49000
+    },
+    {
+      "epoch": 5.885150398287957,
+      "grad_norm": 0.9611983299255371,
+      "learning_rate": 3.528712400428011e-05,
+      "loss": 2.5568,
+      "step": 49500
+    },
+    {
+      "epoch": 5.944596361907027,
+      "grad_norm": 1.1176481246948242,
+      "learning_rate": 3.5138509095232436e-05,
+      "loss": 2.5634,
+      "step": 50000
+    },
+    {
+      "epoch": 6.004042325526097,
+      "grad_norm": 0.8676426410675049,
+      "learning_rate": 3.498989418618476e-05,
+      "loss": 2.5551,
+      "step": 50500
+    },
+    {
+      "epoch": 6.063488289145167,
+      "grad_norm": 0.8983253240585327,
+      "learning_rate": 3.484127927713708e-05,
+      "loss": 2.5442,
+      "step": 51000
+    },
+    {
+      "epoch": 6.122934252764237,
+      "grad_norm": 0.9558296203613281,
+      "learning_rate": 3.4692664368089405e-05,
+      "loss": 2.5415,
+      "step": 51500
+    },
+    {
+      "epoch": 6.182380216383308,
+      "grad_norm": 1.1759629249572754,
+      "learning_rate": 3.4544049459041734e-05,
+      "loss": 2.5186,
+      "step": 52000
+    },
+    {
+      "epoch": 6.241826180002378,
+      "grad_norm": 1.186232089996338,
+      "learning_rate": 3.439543454999406e-05,
+      "loss": 2.5437,
+      "step": 52500
+    },
+    {
+      "epoch": 6.301272143621448,
+      "grad_norm": 1.1072938442230225,
+      "learning_rate": 3.424681964094638e-05,
+      "loss": 2.5442,
+      "step": 53000
+    },
+    {
+      "epoch": 6.360718107240518,
+      "grad_norm": 1.1854956150054932,
+      "learning_rate": 3.40982047318987e-05,
+      "loss": 2.5265,
+      "step": 53500
+    },
+    {
+      "epoch": 6.420164070859588,
+      "grad_norm": 1.037420392036438,
+      "learning_rate": 3.394958982285103e-05,
+      "loss": 2.5101,
+      "step": 54000
+    },
+    {
+      "epoch": 6.479610034478659,
+      "grad_norm": 1.0414271354675293,
+      "learning_rate": 3.3800974913803354e-05,
+      "loss": 2.5291,
+      "step": 54500
+    },
+    {
+      "epoch": 6.539055998097729,
+      "grad_norm": 0.8827362656593323,
+      "learning_rate": 3.365236000475568e-05,
+      "loss": 2.5187,
+      "step": 55000
+    },
+    {
+      "epoch": 6.598501961716799,
+      "grad_norm": 0.9146121144294739,
+      "learning_rate": 3.3503745095708007e-05,
+      "loss": 2.5234,
+      "step": 55500
+    },
+    {
+      "epoch": 6.657947925335869,
+      "grad_norm": 1.0134857892990112,
+      "learning_rate": 3.335513018666033e-05,
+      "loss": 2.5199,
+      "step": 56000
+    },
+    {
+      "epoch": 6.71739388895494,
+      "grad_norm": 1.1852586269378662,
+      "learning_rate": 3.320651527761265e-05,
+      "loss": 2.5347,
+      "step": 56500
+    },
+    {
+      "epoch": 6.77683985257401,
+      "grad_norm": 1.0739943981170654,
+      "learning_rate": 3.3057900368564975e-05,
+      "loss": 2.5367,
+      "step": 57000
+    },
+    {
+      "epoch": 6.83628581619308,
+      "grad_norm": 0.9880659580230713,
+      "learning_rate": 3.2909285459517304e-05,
+      "loss": 2.5181,
+      "step": 57500
+    },
+    {
+      "epoch": 6.8957317798121505,
+      "grad_norm": 1.0519931316375732,
+      "learning_rate": 3.276067055046963e-05,
+      "loss": 2.5325,
+      "step": 58000
+    },
+    {
+      "epoch": 6.955177743431221,
+      "grad_norm": 0.9463315010070801,
+      "learning_rate": 3.261205564142195e-05,
+      "loss": 2.5384,
+      "step": 58500
+    },
+    {
+      "epoch": 7.0146237070502915,
+      "grad_norm": 0.9906750917434692,
+      "learning_rate": 3.246344073237427e-05,
+      "loss": 2.5374,
+      "step": 59000
+    },
+    {
+      "epoch": 7.0740696706693615,
+      "grad_norm": 0.9740419983863831,
+      "learning_rate": 3.23148258233266e-05,
+      "loss": 2.4919,
+      "step": 59500
+    },
+    {
+      "epoch": 7.133515634288432,
+      "grad_norm": 1.0209646224975586,
+      "learning_rate": 3.2166210914278925e-05,
+      "loss": 2.5065,
+      "step": 60000
+    },
+    {
+      "epoch": 7.192961597907502,
+      "grad_norm": 1.1537789106369019,
+      "learning_rate": 3.201759600523125e-05,
+      "loss": 2.4888,
+      "step": 60500
+    },
+    {
+      "epoch": 7.252407561526573,
+      "grad_norm": 1.0545387268066406,
+      "learning_rate": 3.186898109618357e-05,
+      "loss": 2.5042,
+      "step": 61000
+    },
+    {
+      "epoch": 7.311853525145643,
+      "grad_norm": 0.8990502953529358,
+      "learning_rate": 3.17203661871359e-05,
+      "loss": 2.4956,
+      "step": 61500
+    },
+    {
+      "epoch": 7.371299488764713,
+      "grad_norm": 1.0004386901855469,
+      "learning_rate": 3.1571751278088215e-05,
+      "loss": 2.5096,
+      "step": 62000
+    },
+    {
+      "epoch": 7.430745452383783,
+      "grad_norm": 1.192317008972168,
+      "learning_rate": 3.1423136369040545e-05,
+      "loss": 2.5038,
+      "step": 62500
+    },
+    {
+      "epoch": 7.490191416002854,
+      "grad_norm": 0.9577484726905823,
+      "learning_rate": 3.127452145999287e-05,
+      "loss": 2.5113,
+      "step": 63000
+    },
+    {
+      "epoch": 7.549637379621924,
+      "grad_norm": 0.8835137486457825,
+      "learning_rate": 3.11259065509452e-05,
+      "loss": 2.4939,
+      "step": 63500
+    },
+    {
+      "epoch": 7.609083343240994,
+      "grad_norm": 0.8289955258369446,
+      "learning_rate": 3.097729164189751e-05,
+      "loss": 2.4716,
+      "step": 64000
+    },
+    {
+      "epoch": 7.668529306860064,
+      "grad_norm": 0.9576908349990845,
+      "learning_rate": 3.082867673284984e-05,
+      "loss": 2.5109,
+      "step": 64500
+    },
+    {
+      "epoch": 7.727975270479135,
+      "grad_norm": 0.9045142531394958,
+      "learning_rate": 3.0680061823802165e-05,
+      "loss": 2.4811,
+      "step": 65000
+    },
+    {
+      "epoch": 7.787421234098205,
+      "grad_norm": 1.3150789737701416,
+      "learning_rate": 3.053144691475449e-05,
+      "loss": 2.505,
+      "step": 65500
+    },
+    {
+      "epoch": 7.846867197717275,
+      "grad_norm": 0.9815430641174316,
+      "learning_rate": 3.0382832005706814e-05,
+      "loss": 2.4923,
+      "step": 66000
+    },
+    {
+      "epoch": 7.906313161336345,
+      "grad_norm": 1.0355448722839355,
+      "learning_rate": 3.023421709665914e-05,
+      "loss": 2.4867,
+      "step": 66500
+    },
+    {
+      "epoch": 7.965759124955415,
+      "grad_norm": 1.0244001150131226,
+      "learning_rate": 3.0085602187611463e-05,
+      "loss": 2.4973,
+      "step": 67000
+    },
+    {
+      "epoch": 8.025205088574486,
+      "grad_norm": 1.052660584449768,
+      "learning_rate": 2.993698727856379e-05,
+      "loss": 2.4976,
+      "step": 67500
+    },
+    {
+      "epoch": 8.084651052193555,
+      "grad_norm": 1.1590783596038818,
+      "learning_rate": 2.978837236951611e-05,
+      "loss": 2.4631,
+      "step": 68000
+    },
+    {
+      "epoch": 8.144097015812626,
+      "grad_norm": 0.9065755605697632,
+      "learning_rate": 2.9639757460468438e-05,
+      "loss": 2.4494,
+      "step": 68500
+    },
+    {
+      "epoch": 8.203542979431697,
+      "grad_norm": 0.9562356472015381,
+      "learning_rate": 2.9491142551420757e-05,
+      "loss": 2.4728,
+      "step": 69000
+    },
+    {
+      "epoch": 8.262988943050766,
+      "grad_norm": 0.9509665966033936,
+      "learning_rate": 2.9342527642373087e-05,
+      "loss": 2.4747,
+      "step": 69500
+    },
+    {
+      "epoch": 8.322434906669837,
+      "grad_norm": 0.9384153485298157,
+      "learning_rate": 2.9193912733325406e-05,
+      "loss": 2.4745,
+      "step": 70000
+    },
+    {
+      "epoch": 8.381880870288906,
+      "grad_norm": 0.9459151029586792,
+      "learning_rate": 2.9045297824277735e-05,
+      "loss": 2.476,
+      "step": 70500
+    },
+    {
+      "epoch": 8.441326833907977,
+      "grad_norm": 0.9553677439689636,
+      "learning_rate": 2.8896682915230055e-05,
+      "loss": 2.4753,
+      "step": 71000
+    },
+    {
+      "epoch": 8.500772797527048,
+      "grad_norm": 1.014932632446289,
+      "learning_rate": 2.8748068006182384e-05,
+      "loss": 2.4647,
+      "step": 71500
+    },
+    {
+      "epoch": 8.560218761146118,
+      "grad_norm": 0.990463376045227,
+      "learning_rate": 2.8599453097134704e-05,
+      "loss": 2.4782,
+      "step": 72000
+    },
+    {
+      "epoch": 8.619664724765189,
+      "grad_norm": 0.892906665802002,
+      "learning_rate": 2.845083818808703e-05,
+      "loss": 2.4736,
+      "step": 72500
+    },
+    {
+      "epoch": 8.67911068838426,
+      "grad_norm": 0.9943811297416687,
+      "learning_rate": 2.8302223279039352e-05,
+      "loss": 2.4554,
+      "step": 73000
+    },
+    {
+      "epoch": 8.738556652003329,
+      "grad_norm": 0.9325155019760132,
+      "learning_rate": 2.815360836999168e-05,
+      "loss": 2.4703,
+      "step": 73500
+    },
+    {
+      "epoch": 8.7980026156224,
+      "grad_norm": 0.9389231204986572,
+      "learning_rate": 2.8004993460944e-05,
+      "loss": 2.4727,
+      "step": 74000
+    },
+    {
+      "epoch": 8.857448579241469,
+      "grad_norm": 0.9121980667114258,
+      "learning_rate": 2.7856378551896327e-05,
+      "loss": 2.4533,
+      "step": 74500
+    },
+    {
+      "epoch": 8.91689454286054,
+      "grad_norm": 1.046366572380066,
+      "learning_rate": 2.770776364284865e-05,
+      "loss": 2.4652,
+      "step": 75000
+    },
+    {
+      "epoch": 8.97634050647961,
+      "grad_norm": 1.0157803297042847,
+      "learning_rate": 2.7559148733800976e-05,
+      "loss": 2.4701,
+      "step": 75500
+    },
+    {
+      "epoch": 9.03578647009868,
+      "grad_norm": 1.1012301445007324,
+      "learning_rate": 2.74105338247533e-05,
+      "loss": 2.4491,
+      "step": 76000
+    },
+    {
+      "epoch": 9.09523243371775,
+      "grad_norm": 1.000829815864563,
+      "learning_rate": 2.7261918915705625e-05,
+      "loss": 2.4434,
+      "step": 76500
+    },
+    {
+      "epoch": 9.15467839733682,
+      "grad_norm": 1.028676986694336,
+      "learning_rate": 2.7113304006657948e-05,
+      "loss": 2.4392,
+      "step": 77000
+    },
+    {
+      "epoch": 9.214124360955891,
+      "grad_norm": 1.0821462869644165,
+      "learning_rate": 2.6964689097610274e-05,
+      "loss": 2.4289,
+      "step": 77500
+    },
+    {
+      "epoch": 9.273570324574962,
+      "grad_norm": 0.951738715171814,
+      "learning_rate": 2.6816074188562596e-05,
+      "loss": 2.4437,
+      "step": 78000
+    },
+    {
+      "epoch": 9.333016288194031,
+      "grad_norm": 0.9170756936073303,
+      "learning_rate": 2.6667459279514923e-05,
+      "loss": 2.4507,
+      "step": 78500
+    },
+    {
+      "epoch": 9.392462251813102,
+      "grad_norm": 0.9591684937477112,
+      "learning_rate": 2.6518844370467245e-05,
+      "loss": 2.4584,
+      "step": 79000
+    },
+    {
+      "epoch": 9.451908215432173,
+      "grad_norm": 1.1289016008377075,
+      "learning_rate": 2.637022946141957e-05,
+      "loss": 2.4595,
+      "step": 79500
+    },
+    {
+      "epoch": 9.511354179051242,
+      "grad_norm": 1.0114785432815552,
+      "learning_rate": 2.6221614552371894e-05,
+      "loss": 2.4404,
+      "step": 80000
+    },
+    {
+      "epoch": 9.570800142670313,
+      "grad_norm": 1.1835304498672485,
+      "learning_rate": 2.607299964332422e-05,
+      "loss": 2.4308,
+      "step": 80500
+    },
+    {
+      "epoch": 9.630246106289382,
+      "grad_norm": 0.9822309017181396,
+      "learning_rate": 2.592438473427654e-05,
+      "loss": 2.4387,
+      "step": 81000
+    },
+    {
+      "epoch": 9.689692069908453,
+      "grad_norm": 1.114311695098877,
+      "learning_rate": 2.577576982522887e-05,
+      "loss": 2.4519,
+      "step": 81500
+    },
+    {
+      "epoch": 9.749138033527524,
+      "grad_norm": 1.1047866344451904,
+      "learning_rate": 2.5627154916181195e-05,
+      "loss": 2.4497,
+      "step": 82000
+    },
+    {
+      "epoch": 9.808583997146593,
+      "grad_norm": 0.9930892586708069,
+      "learning_rate": 2.5478540007133518e-05,
+      "loss": 2.4489,
+      "step": 82500
+    },
+    {
+      "epoch": 9.868029960765664,
+      "grad_norm": 1.1107361316680908,
+      "learning_rate": 2.5329925098085844e-05,
+      "loss": 2.4399,
+      "step": 83000
+    },
+    {
+      "epoch": 9.927475924384733,
+      "grad_norm": 1.0770343542099,
+      "learning_rate": 2.5181310189038167e-05,
+      "loss": 2.4362,
+      "step": 83500
+    },
+    {
+      "epoch": 9.986921888003804,
+      "grad_norm": 0.9818819761276245,
+      "learning_rate": 2.5032695279990493e-05,
+      "loss": 2.4418,
+      "step": 84000
+    },
+    {
+      "epoch": 10.046367851622875,
+      "grad_norm": 1.1135622262954712,
+      "learning_rate": 2.4884080370942815e-05,
+      "loss": 2.428,
+      "step": 84500
+    },
+    {
+      "epoch": 10.105813815241945,
+      "grad_norm": 1.035888671875,
+      "learning_rate": 2.4735465461895138e-05,
+      "loss": 2.4193,
+      "step": 85000
+    },
+    {
+      "epoch": 10.165259778861016,
+      "grad_norm": 0.9694905281066895,
+      "learning_rate": 2.458685055284746e-05,
+      "loss": 2.4165,
+      "step": 85500
+    },
+    {
+      "epoch": 10.224705742480086,
+      "grad_norm": 1.116449236869812,
+      "learning_rate": 2.4438235643799787e-05,
+      "loss": 2.4122,
+      "step": 86000
+    },
+    {
+      "epoch": 10.284151706099156,
+      "grad_norm": 0.9860423803329468,
+      "learning_rate": 2.428962073475211e-05,
+      "loss": 2.4173,
+      "step": 86500
+    },
+    {
+      "epoch": 10.343597669718227,
+      "grad_norm": 1.1727473735809326,
+      "learning_rate": 2.4141005825704436e-05,
+      "loss": 2.4258,
+      "step": 87000
+    },
+    {
+      "epoch": 10.403043633337296,
+      "grad_norm": 1.0731017589569092,
+      "learning_rate": 2.399239091665676e-05,
+      "loss": 2.4289,
+      "step": 87500
+    },
+    {
+      "epoch": 10.462489596956367,
+      "grad_norm": 1.0740883350372314,
+      "learning_rate": 2.3843776007609085e-05,
+      "loss": 2.4142,
+      "step": 88000
+    },
+    {
+      "epoch": 10.521935560575438,
+      "grad_norm": 1.1342713832855225,
+      "learning_rate": 2.3695161098561407e-05,
+      "loss": 2.4315,
+      "step": 88500
+    },
+    {
+      "epoch": 10.581381524194507,
+      "grad_norm": 1.0230334997177124,
+      "learning_rate": 2.3546546189513733e-05,
+      "loss": 2.4352,
+      "step": 89000
+    },
+    {
+      "epoch": 10.640827487813578,
+      "grad_norm": 1.0113749504089355,
+      "learning_rate": 2.3397931280466056e-05,
+      "loss": 2.4128,
+      "step": 89500
+    },
+    {
+      "epoch": 10.700273451432647,
+      "grad_norm": 1.0363703966140747,
+      "learning_rate": 2.3249316371418382e-05,
+      "loss": 2.4343,
+      "step": 90000
+    },
+    {
+      "epoch": 10.759719415051718,
+      "grad_norm": 1.0065736770629883,
+      "learning_rate": 2.3100701462370705e-05,
+      "loss": 2.4268,
+      "step": 90500
+    },
+    {
+      "epoch": 10.819165378670789,
+      "grad_norm": 0.949798047542572,
+      "learning_rate": 2.295208655332303e-05,
+      "loss": 2.4114,
+      "step": 91000
+    },
+    {
+      "epoch": 10.878611342289858,
+      "grad_norm": 0.9772433042526245,
+      "learning_rate": 2.2803471644275354e-05,
+      "loss": 2.4187,
+      "step": 91500
+    },
+    {
+      "epoch": 10.938057305908929,
+      "grad_norm": 0.9436720609664917,
+      "learning_rate": 2.2654856735227677e-05,
+      "loss": 2.4151,
+      "step": 92000
+    },
+    {
+      "epoch": 10.997503269528,
+      "grad_norm": 0.9903433918952942,
+      "learning_rate": 2.2506241826180003e-05,
+      "loss": 2.4332,
+      "step": 92500
+    },
+    {
+      "epoch": 11.05694923314707,
+      "grad_norm": 0.9285963177680969,
+      "learning_rate": 2.2357626917132325e-05,
+      "loss": 2.3895,
+      "step": 93000
+    },
+    {
+      "epoch": 11.11639519676614,
+      "grad_norm": 1.0996205806732178,
+      "learning_rate": 2.220901200808465e-05,
+      "loss": 2.3858,
+      "step": 93500
+    },
+    {
+      "epoch": 11.17584116038521,
+      "grad_norm": 0.9550360441207886,
+      "learning_rate": 2.2060397099036974e-05,
+      "loss": 2.4016,
+      "step": 94000
+    },
+    {
+      "epoch": 11.23528712400428,
+      "grad_norm": 1.3018606901168823,
+      "learning_rate": 2.19117821899893e-05,
+      "loss": 2.4031,
+      "step": 94500
+    },
+    {
+      "epoch": 11.294733087623351,
+      "grad_norm": 0.9388914704322815,
+      "learning_rate": 2.1763167280941626e-05,
+      "loss": 2.4094,
+      "step": 95000
+    },
+    {
+      "epoch": 11.35417905124242,
+      "grad_norm": 0.9850655794143677,
+      "learning_rate": 2.161455237189395e-05,
+      "loss": 2.4054,
+      "step": 95500
+    },
+    {
+      "epoch": 11.413625014861491,
+      "grad_norm": 1.038522481918335,
+      "learning_rate": 2.1465937462846275e-05,
+      "loss": 2.3895,
+      "step": 96000
+    },
+    {
+      "epoch": 11.47307097848056,
+      "grad_norm": 1.0989197492599487,
+      "learning_rate": 2.1317322553798598e-05,
+      "loss": 2.4019,
+      "step": 96500
+    },
+    {
+      "epoch": 11.532516942099631,
+      "grad_norm": 1.0527700185775757,
+      "learning_rate": 2.1168707644750924e-05,
+      "loss": 2.399,
+      "step": 97000
+    },
+    {
+      "epoch": 11.591962905718702,
+      "grad_norm": 1.273655652999878,
+      "learning_rate": 2.1020092735703247e-05,
+      "loss": 2.4259,
+      "step": 97500
+    },
+    {
+      "epoch": 11.651408869337772,
+      "grad_norm": 1.002064824104309,
+      "learning_rate": 2.0871477826655573e-05,
+      "loss": 2.4073,
+      "step": 98000
+    },
+    {
+      "epoch": 11.710854832956842,
+      "grad_norm": 0.9922045469284058,
+      "learning_rate": 2.0722862917607896e-05,
+      "loss": 2.4059,
+      "step": 98500
+    },
+    {
+      "epoch": 11.770300796575913,
+      "grad_norm": 0.9962035417556763,
+      "learning_rate": 2.057424800856022e-05,
+      "loss": 2.4174,
+      "step": 99000
+    },
+    {
+      "epoch": 11.829746760194983,
+      "grad_norm": 1.0998961925506592,
+      "learning_rate": 2.0425633099512544e-05,
+      "loss": 2.4133,
+      "step": 99500
+    },
+    {
+      "epoch": 11.889192723814054,
+      "grad_norm": 1.0380686521530151,
+      "learning_rate": 2.027701819046487e-05,
+      "loss": 2.414,
+      "step": 100000
+    }
+  ],
+  "logging_steps": 500,
+  "max_steps": 168220,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 500,
+  "total_flos": 4.1803850907648e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7163e8e93997d9542b4813116cd402aa38eade3ba35ae0069435235054002545
+size 4920

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff