initial commit

Files changed (11) hide show

config.json +68 -0
generation_config.json +16 -0
model.safetensors +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +51 -0
tokenizer.json +0 -0
trainer_state.json +1330 -0
training_args.bin +3 -0
vocab.txt +0 -0

config.json ADDED Viewed

	@@ -0,0 +1,68 @@

+{
+  "_name_or_path": "facebook/bart-large-cnn",
+  "_num_labels": 3,
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "add_final_layer_norm": false,
+  "architectures": ["BartForConditionalGeneration"],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "classif_dropout": 0.0,
+  "classifier_dropout": 0.0,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 12,
+  "decoder_start_token_id": 2,
+  "dropout": 0.1,
+  "early_stopping": true,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 12,
+  "eos_token_id": 2,
+  "force_bos_token_to_be_generated": true,
+  "forced_bos_token_id": 0,
+  "forced_eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "length_penalty": 2.0,
+  "max_length": 142,
+  "max_position_embeddings": 1024,
+  "min_length": 56,
+  "model_type": "bart",
+  "no_repeat_ngram_size": 3,
+  "normalize_before": false,
+  "num_beams": 4,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 1,
+  "prefix": " ",
+  "scale_embedding": false,
+  "task_specific_params": {
+    "summarization": {
+      "early_stopping": true,
+      "length_penalty": 2.0,
+      "max_length": 142,
+      "min_length": 56,
+      "no_repeat_ngram_size": 3,
+      "num_beams": 4
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.39.3",
+  "use_cache": true,
+  "vocab_size": 50264
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "decoder_start_token_id": 2,
+  "early_stopping": true,
+  "eos_token_id": 2,
+  "forced_bos_token_id": 0,
+  "forced_eos_token_id": 2,
+  "length_penalty": 2.0,
+  "max_length": 142,
+  "min_length": 56,
+  "no_repeat_ngram_size": 3,
+  "num_beams": 4,
+  "pad_token_id": 1,
+  "transformers_version": "4.39.3"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb98f4ef30260cf3dae3226ddcb4f72d2b8275b7a7cca81cbfac92e6ac0e599d
+size 1625422896

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a79d260ad25ff60576a3b81ee9173c9d396400c2b5894b5a7ff7745c883095b
+size 3250751759

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:375e35d892a6fa01592c9545b1e6b60098a97aec2909cd43f820c0cd58a54ae5
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:505e140348cf428a705b42cb8aafd791b8d0b0ac55c70e82b985098cfa08a46a
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1330 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 5.0,
+  "eval_steps": 500,
+  "global_step": 18750,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.03,
+      "grad_norm": 13.29955005645752,
+      "learning_rate": 1e-5,
+      "loss": 3.9042,
+      "step": 100
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.7849366664886475,
+      "learning_rate": 2e-5,
+      "loss": 0.6647,
+      "step": 200
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.459277868270874,
+      "learning_rate": 3e-5,
+      "loss": 0.5227,
+      "step": 300
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.3880829811096191,
+      "learning_rate": 4e-5,
+      "loss": 0.5149,
+      "step": 400
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.3338109254837036,
+      "learning_rate": 5e-5,
+      "loss": 0.4984,
+      "step": 500
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.065587043762207,
+      "learning_rate": 4.972602739726028e-5,
+      "loss": 0.4847,
+      "step": 600
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.0945565700531006,
+      "learning_rate": 4.945205479452055e-5,
+      "loss": 0.4615,
+      "step": 700
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.0960415601730347,
+      "learning_rate": 4.917808219178082e-5,
+      "loss": 0.4504,
+      "step": 800
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.0467826128005981,
+      "learning_rate": 4.89041095890411e-5,
+      "loss": 0.4415,
+      "step": 900
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.0690410137176514,
+      "learning_rate": 4.863013698630137e-5,
+      "loss": 0.4497,
+      "step": 1000
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.9305112957954407,
+      "learning_rate": 4.835616438356165e-5,
+      "loss": 0.4363,
+      "step": 1100
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.8500822186470032,
+      "learning_rate": 4.808219178082192e-5,
+      "loss": 0.4306,
+      "step": 1200
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.2137833833694458,
+      "learning_rate": 4.780821917808219e-5,
+      "loss": 0.423,
+      "step": 1300
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.9352328777313232,
+      "learning_rate": 4.753424657534247e-5,
+      "loss": 0.4161,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.1027398109436035,
+      "learning_rate": 4.726027397260274e-5,
+      "loss": 0.4083,
+      "step": 1500
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.924199104309082,
+      "learning_rate": 4.698630136986302e-5,
+      "loss": 0.4254,
+      "step": 1600
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.8912659883499146,
+      "learning_rate": 4.671232876712329e-5,
+      "loss": 0.3918,
+      "step": 1700
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.0025393962860107,
+      "learning_rate": 4.643835616438356e-5,
+      "loss": 0.4046,
+      "step": 1800
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.9362453818321228,
+      "learning_rate": 4.616438356164384e-5,
+      "loss": 0.3979,
+      "step": 1900
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.8841680884361267,
+      "learning_rate": 4.589041095890411e-5,
+      "loss": 0.3938,
+      "step": 2000
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.898572564125061,
+      "learning_rate": 4.561643835616439e-5,
+      "loss": 0.3974,
+      "step": 2100
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.9011989831924438,
+      "learning_rate": 4.534246575342466e-5,
+      "loss": 0.3955,
+      "step": 2200
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.913512110710144,
+      "learning_rate": 4.506849315068493e-5,
+      "loss": 0.3842,
+      "step": 2300
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.9077229499816895,
+      "learning_rate": 4.479452054794521e-5,
+      "loss": 0.3756,
+      "step": 2400
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.8107369542121887,
+      "learning_rate": 4.452054794520548e-5,
+      "loss": 0.3781,
+      "step": 2500
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.962982714176178,
+      "learning_rate": 4.424657534246576e-5,
+      "loss": 0.3728,
+      "step": 2600
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.9333651065826416,
+      "learning_rate": 4.3972602739726035e-5,
+      "loss": 0.3731,
+      "step": 2700
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.9969388246536255,
+      "learning_rate": 4.36986301369863e-5,
+      "loss": 0.3793,
+      "step": 2800
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.8959200978279114,
+      "learning_rate": 4.342465753424658e-5,
+      "loss": 0.3787,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.8185614943504333,
+      "learning_rate": 4.3150684931506855e-5,
+      "loss": 0.3626,
+      "step": 3000
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.1243007183074951,
+      "learning_rate": 4.2876712328767126e-5,
+      "loss": 0.3681,
+      "step": 3100
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.0251150131225586,
+      "learning_rate": 4.2602739726027404e-5,
+      "loss": 0.3609,
+      "step": 3200
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.8459119200706482,
+      "learning_rate": 4.232876712328767e-5,
+      "loss": 0.3608,
+      "step": 3300
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.9018300175666809,
+      "learning_rate": 4.2054794520547946e-5,
+      "loss": 0.3779,
+      "step": 3400
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.804707407951355,
+      "learning_rate": 4.1780821917808224e-5,
+      "loss": 0.3674,
+      "step": 3500
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.8781819343566895,
+      "learning_rate": 4.1506849315068495e-5,
+      "loss": 0.3635,
+      "step": 3600
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.9687257409095764,
+      "learning_rate": 4.123287671232877e-5,
+      "loss": 0.3617,
+      "step": 3700
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.7628118991851807,
+      "learning_rate": 4.0958904109589044e-5,
+      "loss": 0.3352,
+      "step": 3800
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.9802232980728149,
+      "learning_rate": 4.0684931506849315e-5,
+      "loss": 0.3244,
+      "step": 3900
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.7366902828216553,
+      "learning_rate": 4.041095890410959e-5,
+      "loss": 0.3237,
+      "step": 4000
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.8844860196113586,
+      "learning_rate": 4.0136986301369864e-5,
+      "loss": 0.3296,
+      "step": 4100
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.692650556564331,
+      "learning_rate": 3.9863013698630135e-5,
+      "loss": 0.3165,
+      "step": 4200
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.8171700239181519,
+      "learning_rate": 3.958904109589041e-5,
+      "loss": 0.323,
+      "step": 4300
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.9350169897079468,
+      "learning_rate": 3.9315068493150684e-5,
+      "loss": 0.3259,
+      "step": 4400
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.9551327228546143,
+      "learning_rate": 3.904109589041096e-5,
+      "loss": 0.3252,
+      "step": 4500
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.8646096587181091,
+      "learning_rate": 3.8767123287671233e-5,
+      "loss": 0.3267,
+      "step": 4600
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.8012389540672302,
+      "learning_rate": 3.8493150684931505e-5,
+      "loss": 0.3149,
+      "step": 4700
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.833848774433136,
+      "learning_rate": 3.821917808219178e-5,
+      "loss": 0.3164,
+      "step": 4800
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.7836089730262756,
+      "learning_rate": 3.7945205479452054e-5,
+      "loss": 0.3206,
+      "step": 4900
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.8694811463356018,
+      "learning_rate": 3.767123287671233e-5,
+      "loss": 0.3187,
+      "step": 5000
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.8749567866325378,
+      "learning_rate": 3.739726027397261e-5,
+      "loss": 0.3165,
+      "step": 5100
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.8689484596252441,
+      "learning_rate": 3.7123287671232874e-5,
+      "loss": 0.3154,
+      "step": 5200
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.8809706568717957,
+      "learning_rate": 3.684931506849315e-5,
+      "loss": 0.3301,
+      "step": 5300
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.8677769899368286,
+      "learning_rate": 3.657534246575342e-5,
+      "loss": 0.3184,
+      "step": 5400
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.8212382793426514,
+      "learning_rate": 3.63013698630137e-5,
+      "loss": 0.3181,
+      "step": 5500
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.8636347651481628,
+      "learning_rate": 3.602739726027398e-5,
+      "loss": 0.3138,
+      "step": 5600
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.8136293292045593,
+      "learning_rate": 3.575342465753424e-5,
+      "loss": 0.3156,
+      "step": 5700
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 0.7700251936912537,
+      "learning_rate": 3.547945205479452e-5,
+      "loss": 0.3179,
+      "step": 5800
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 0.7282480597496033,
+      "learning_rate": 3.52054794520548e-5,
+      "loss": 0.3188,
+      "step": 5900
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.7657186388969421,
+      "learning_rate": 3.493150684931507e-5,
+      "loss": 0.3137,
+      "step": 6000
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 0.8558144569396973,
+      "learning_rate": 3.465753424657535e-5,
+      "loss": 0.3192,
+      "step": 6100
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 0.7496147751808167,
+      "learning_rate": 3.438356164383562e-5,
+      "loss": 0.3175,
+      "step": 6200
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 0.9365683794021606,
+      "learning_rate": 3.410958904109589e-5,
+      "loss": 0.3124,
+      "step": 6300
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 0.8127835392951965,
+      "learning_rate": 3.383561643835617e-5,
+      "loss": 0.3056,
+      "step": 6400
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 0.819684624671936,
+      "learning_rate": 3.356164383561644e-5,
+      "loss": 0.3144,
+      "step": 6500
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.7603724598884583,
+      "learning_rate": 3.328767123287672e-5,
+      "loss": 0.315,
+      "step": 6600
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.8054817318916321,
+      "learning_rate": 3.301369863013699e-5,
+      "loss": 0.3073,
+      "step": 6700
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.758423924446106,
+      "learning_rate": 3.273972602739726e-5,
+      "loss": 0.312,
+      "step": 6800
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 0.8245046138763428,
+      "learning_rate": 3.246575342465754e-5,
+      "loss": 0.3125,
+      "step": 6900
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 0.7906696796417236,
+      "learning_rate": 3.219178082191781e-5,
+      "loss": 0.3009,
+      "step": 7000
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 0.8566040992736816,
+      "learning_rate": 3.1917808219178086e-5,
+      "loss": 0.3043,
+      "step": 7100
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.7341597080230713,
+      "learning_rate": 3.164383561643836e-5,
+      "loss": 0.309,
+      "step": 7200
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.7561280131340027,
+      "learning_rate": 3.136986301369863e-5,
+      "loss": 0.3051,
+      "step": 7300
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 0.7900431156158447,
+      "learning_rate": 3.1095890410958906e-5,
+      "loss": 0.3093,
+      "step": 7400
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.880424976348877,
+      "learning_rate": 3.082191780821918e-5,
+      "loss": 0.3058,
+      "step": 7500
+    },
+    {
+      "epoch": 2.03,
+      "grad_norm": 0.8830358982086182,
+      "learning_rate": 3.0547945205479455e-5,
+      "loss": 0.2673,
+      "step": 7600
+    },
+    {
+      "epoch": 2.05,
+      "grad_norm": 0.6983394026756287,
+      "learning_rate": 3.0273972602739726e-5,
+      "loss": 0.2739,
+      "step": 7700
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.8467246890068054,
+      "learning_rate": 3e-5,
+      "loss": 0.2694,
+      "step": 7800
+    },
+    {
+      "epoch": 2.11,
+      "grad_norm": 0.8425388932228088,
+      "learning_rate": 2.9726027397260275e-5,
+      "loss": 0.2698,
+      "step": 7900
+    },
+    {
+      "epoch": 2.13,
+      "grad_norm": 0.6956115365028381,
+      "learning_rate": 2.945205479452055e-5,
+      "loss": 0.2616,
+      "step": 8000
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 0.9649244546890259,
+      "learning_rate": 2.9178082191780824e-5,
+      "loss": 0.2763,
+      "step": 8100
+    },
+    {
+      "epoch": 2.19,
+      "grad_norm": 0.7081593871116638,
+      "learning_rate": 2.8904109589041095e-5,
+      "loss": 0.2683,
+      "step": 8200
+    },
+    {
+      "epoch": 2.21,
+      "grad_norm": 0.9411781430244446,
+      "learning_rate": 2.863013698630137e-5,
+      "loss": 0.2621,
+      "step": 8300
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 0.8201924562454224,
+      "learning_rate": 2.8356164383561644e-5,
+      "loss": 0.2701,
+      "step": 8400
+    },
+    {
+      "epoch": 2.27,
+      "grad_norm": 0.8518856167793274,
+      "learning_rate": 2.808219178082192e-5,
+      "loss": 0.272,
+      "step": 8500
+    },
+    {
+      "epoch": 2.29,
+      "grad_norm": 0.8004194498062134,
+      "learning_rate": 2.7808219178082197e-5,
+      "loss": 0.267,
+      "step": 8600
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 0.9312605857849121,
+      "learning_rate": 2.7534246575342465e-5,
+      "loss": 0.2632,
+      "step": 8700
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 0.8414776921272278,
+      "learning_rate": 2.726027397260274e-5,
+      "loss": 0.2681,
+      "step": 8800
+    },
+    {
+      "epoch": 2.37,
+      "grad_norm": 0.6925989985466003,
+      "learning_rate": 2.6986301369863014e-5,
+      "loss": 0.2668,
+      "step": 8900
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.9184579849243164,
+      "learning_rate": 2.671232876712329e-5,
+      "loss": 0.2673,
+      "step": 9000
+    },
+    {
+      "epoch": 2.43,
+      "grad_norm": 1.1033433675765991,
+      "learning_rate": 2.6438356164383566e-5,
+      "loss": 0.2684,
+      "step": 9100
+    },
+    {
+      "epoch": 2.45,
+      "grad_norm": 0.9113504886627197,
+      "learning_rate": 2.6164383561643834e-5,
+      "loss": 0.2644,
+      "step": 9200
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 0.7905146479606628,
+      "learning_rate": 2.589041095890411e-5,
+      "loss": 0.2668,
+      "step": 9300
+    },
+    {
+      "epoch": 2.51,
+      "grad_norm": 0.6717493534088135,
+      "learning_rate": 2.5616438356164386e-5,
+      "loss": 0.271,
+      "step": 9400
+    },
+    {
+      "epoch": 2.53,
+      "grad_norm": 0.8438414335250854,
+      "learning_rate": 2.534246575342466e-5,
+      "loss": 0.2706,
+      "step": 9500
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.8165556192398071,
+      "learning_rate": 2.5068493150684935e-5,
+      "loss": 0.2603,
+      "step": 9600
+    },
+    {
+      "epoch": 2.59,
+      "grad_norm": 0.8030436038970947,
+      "learning_rate": 2.4794520547945206e-5,
+      "loss": 0.2587,
+      "step": 9700
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 0.8518214225769043,
+      "learning_rate": 2.452054794520548e-5,
+      "loss": 0.2533,
+      "step": 9800
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 0.9882023930549622,
+      "learning_rate": 2.4246575342465755e-5,
+      "loss": 0.2561,
+      "step": 9900
+    },
+    {
+      "epoch": 2.67,
+      "grad_norm": 0.8175749182701111,
+      "learning_rate": 2.3972602739726026e-5,
+      "loss": 0.2572,
+      "step": 10000
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 0.897048830986023,
+      "learning_rate": 2.36986301369863e-5,
+      "loss": 0.2587,
+      "step": 10100
+    },
+    {
+      "epoch": 2.72,
+      "grad_norm": 0.8218054175376892,
+      "learning_rate": 2.342465753424658e-5,
+      "loss": 0.2654,
+      "step": 10200
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 0.7128798961639404,
+      "learning_rate": 2.315068493150685e-5,
+      "loss": 0.2642,
+      "step": 10300
+    },
+    {
+      "epoch": 2.77,
+      "grad_norm": 0.7982375621795654,
+      "learning_rate": 2.2876712328767124e-5,
+      "loss": 0.2537,
+      "step": 10400
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.790105938911438,
+      "learning_rate": 2.2602739726027396e-5,
+      "loss": 0.2713,
+      "step": 10500
+    },
+    {
+      "epoch": 2.83,
+      "grad_norm": 0.7734562158584595,
+      "learning_rate": 2.2328767123287673e-5,
+      "loss": 0.2616,
+      "step": 10600
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 0.8464659452438354,
+      "learning_rate": 2.2054794520547948e-5,
+      "loss": 0.2584,
+      "step": 10700
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 0.7386855483055115,
+      "learning_rate": 2.178082191780822e-5,
+      "loss": 0.257,
+      "step": 10800
+    },
+    {
+      "epoch": 2.91,
+      "grad_norm": 0.7122279405593872,
+      "learning_rate": 2.1506849315068494e-5,
+      "loss": 0.2667,
+      "step": 10900
+    },
+    {
+      "epoch": 2.93,
+      "grad_norm": 0.8505749106407166,
+      "learning_rate": 2.1232876712328768e-5,
+      "loss": 0.2661,
+      "step": 11000
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 0.8915577530860901,
+      "learning_rate": 2.0958904109589043e-5,
+      "loss": 0.2567,
+      "step": 11100
+    },
+    {
+      "epoch": 2.99,
+      "grad_norm": 0.9431042671203613,
+      "learning_rate": 2.0684931506849317e-5,
+      "loss": 0.2578,
+      "step": 11200
+    },
+    {
+      "epoch": 3.01,
+      "grad_norm": 0.7943726181983948,
+      "learning_rate": 2.0410958904109588e-5,
+      "loss": 0.2393,
+      "step": 11300
+    },
+    {
+      "epoch": 3.04,
+      "grad_norm": 0.8244442939758301,
+      "learning_rate": 2.0136986301369866e-5,
+      "loss": 0.2175,
+      "step": 11400
+    },
+    {
+      "epoch": 3.07,
+      "grad_norm": 0.7802647948265076,
+      "learning_rate": 1.9863013698630137e-5,
+      "loss": 0.2161,
+      "step": 11500
+    },
+    {
+      "epoch": 3.09,
+      "grad_norm": 1.1162070035934448,
+      "learning_rate": 1.9589041095890412e-5,
+      "loss": 0.2211,
+      "step": 11600
+    },
+    {
+      "epoch": 3.12,
+      "grad_norm": 1.0273113250732422,
+      "learning_rate": 1.9315068493150686e-5,
+      "loss": 0.2253,
+      "step": 11700
+    },
+    {
+      "epoch": 3.15,
+      "grad_norm": 1.0477781295776367,
+      "learning_rate": 1.904109589041096e-5,
+      "loss": 0.2213,
+      "step": 11800
+    },
+    {
+      "epoch": 3.17,
+      "grad_norm": 0.9134103655815125,
+      "learning_rate": 1.8767123287671235e-5,
+      "loss": 0.2269,
+      "step": 11900
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 0.8156262636184692,
+      "learning_rate": 1.8493150684931506e-5,
+      "loss": 0.2245,
+      "step": 12000
+    },
+    {
+      "epoch": 3.23,
+      "grad_norm": 0.9004743695259094,
+      "learning_rate": 1.821917808219178e-5,
+      "loss": 0.2254,
+      "step": 12100
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 0.8386040925979614,
+      "learning_rate": 1.7945205479452055e-5,
+      "loss": 0.2292,
+      "step": 12200
+    },
+    {
+      "epoch": 3.28,
+      "grad_norm": 0.9777556657791138,
+      "learning_rate": 1.767123287671233e-5,
+      "loss": 0.2213,
+      "step": 12300
+    },
+    {
+      "epoch": 3.31,
+      "grad_norm": 0.7827901244163513,
+      "learning_rate": 1.7397260273972604e-5,
+      "loss": 0.2174,
+      "step": 12400
+    },
+    {
+      "epoch": 3.33,
+      "grad_norm": 0.7424948811531067,
+      "learning_rate": 1.7123287671232875e-5,
+      "loss": 0.2199,
+      "step": 12500
+    },
+    {
+      "epoch": 3.36,
+      "grad_norm": 0.8807641267776489,
+      "learning_rate": 1.684931506849315e-5,
+      "loss": 0.2204,
+      "step": 12600
+    },
+    {
+      "epoch": 3.39,
+      "grad_norm": 0.8479088544845581,
+      "learning_rate": 1.6575342465753428e-5,
+      "loss": 0.2241,
+      "step": 12700
+    },
+    {
+      "epoch": 3.41,
+      "grad_norm": 0.9211342334747314,
+      "learning_rate": 1.63013698630137e-5,
+      "loss": 0.2237,
+      "step": 12800
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 0.8683446645736694,
+      "learning_rate": 1.6027397260273974e-5,
+      "loss": 0.2248,
+      "step": 12900
+    },
+    {
+      "epoch": 3.47,
+      "grad_norm": 0.8828756213188171,
+      "learning_rate": 1.5753424657534248e-5,
+      "loss": 0.233,
+      "step": 13000
+    },
+    {
+      "epoch": 3.49,
+      "grad_norm": 0.9421214461326599,
+      "learning_rate": 1.5479452054794523e-5,
+      "loss": 0.2294,
+      "step": 13100
+    },
+    {
+      "epoch": 3.52,
+      "grad_norm": 0.765132486820221,
+      "learning_rate": 1.5205479452054797e-5,
+      "loss": 0.2277,
+      "step": 13200
+    },
+    {
+      "epoch": 3.55,
+      "grad_norm": 0.9406650066375732,
+      "learning_rate": 1.4931506849315068e-5,
+      "loss": 0.217,
+      "step": 13300
+    },
+    {
+      "epoch": 3.57,
+      "grad_norm": 1.0174639225006104,
+      "learning_rate": 1.4657534246575344e-5,
+      "loss": 0.2265,
+      "step": 13400
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 0.826392412185669,
+      "learning_rate": 1.4383561643835617e-5,
+      "loss": 0.222,
+      "step": 13500
+    },
+    {
+      "epoch": 3.63,
+      "grad_norm": 0.9821271300315857,
+      "learning_rate": 1.4109589041095892e-5,
+      "loss": 0.2186,
+      "step": 13600
+    },
+    {
+      "epoch": 3.65,
+      "grad_norm": 0.8172212839126587,
+      "learning_rate": 1.3835616438356164e-5,
+      "loss": 0.2238,
+      "step": 13700
+    },
+    {
+      "epoch": 3.68,
+      "grad_norm": 0.8128436207771301,
+      "learning_rate": 1.3561643835616439e-5,
+      "loss": 0.2168,
+      "step": 13800
+    },
+    {
+      "epoch": 3.71,
+      "grad_norm": 0.8061575293540955,
+      "learning_rate": 1.3287671232876714e-5,
+      "loss": 0.2244,
+      "step": 13900
+    },
+    {
+      "epoch": 3.73,
+      "grad_norm": 0.8976914882659912,
+      "learning_rate": 1.3013698630136986e-5,
+      "loss": 0.2212,
+      "step": 14000
+    },
+    {
+      "epoch": 3.76,
+      "grad_norm": 0.9973928332328796,
+      "learning_rate": 1.273972602739726e-5,
+      "loss": 0.2248,
+      "step": 14100
+    },
+    {
+      "epoch": 3.79,
+      "grad_norm": 0.8042004108428955,
+      "learning_rate": 1.2465753424657535e-5,
+      "loss": 0.2178,
+      "step": 14200
+    },
+    {
+      "epoch": 3.81,
+      "grad_norm": 0.8282990455627441,
+      "learning_rate": 1.2191780821917808e-5,
+      "loss": 0.2227,
+      "step": 14300
+    },
+    {
+      "epoch": 3.84,
+      "grad_norm": 0.6668768525123596,
+      "learning_rate": 1.1917808219178083e-5,
+      "loss": 0.2226,
+      "step": 14400
+    },
+    {
+      "epoch": 3.87,
+      "grad_norm": 0.7972692847251892,
+      "learning_rate": 1.1643835616438355e-5,
+      "loss": 0.2193,
+      "step": 14500
+    },
+    {
+      "epoch": 3.89,
+      "grad_norm": 0.7637550830841064,
+      "learning_rate": 1.1369863013698632e-5,
+      "loss": 0.2157,
+      "step": 14600
+    },
+    {
+      "epoch": 3.92,
+      "grad_norm": 0.8487162590026855,
+      "learning_rate": 1.1095890410958904e-5,
+      "loss": 0.2251,
+      "step": 14700
+    },
+    {
+      "epoch": 3.95,
+      "grad_norm": 0.8710606694221497,
+      "learning_rate": 1.0821917808219179e-5,
+      "loss": 0.2153,
+      "step": 14800
+    },
+    {
+      "epoch": 3.97,
+      "grad_norm": 0.8085966110229492,
+      "learning_rate": 1.0547945205479452e-5,
+      "loss": 0.2191,
+      "step": 14900
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.94338059425354,
+      "learning_rate": 1.0273972602739726e-5,
+      "loss": 0.2184,
+      "step": 15000
+    },
+    {
+      "epoch": 4.03,
+      "grad_norm": 1.4945096969604492,
+      "learning_rate": 1e-5,
+      "loss": 0.1863,
+      "step": 15100
+    },
+    {
+      "epoch": 4.05,
+      "grad_norm": 0.9178032279014587,
+      "learning_rate": 9.726027397260275e-6,
+      "loss": 0.1854,
+      "step": 15200
+    },
+    {
+      "epoch": 4.08,
+      "grad_norm": 0.8616482615470886,
+      "learning_rate": 9.452054794520548e-6,
+      "loss": 0.1843,
+      "step": 15300
+    },
+    {
+      "epoch": 4.11,
+      "grad_norm": 0.9844592213630676,
+      "learning_rate": 9.178082191780823e-6,
+      "loss": 0.1909,
+      "step": 15400
+    },
+    {
+      "epoch": 4.13,
+      "grad_norm": 0.7312936186790466,
+      "learning_rate": 8.904109589041095e-6,
+      "loss": 0.1899,
+      "step": 15500
+    },
+    {
+      "epoch": 4.16,
+      "grad_norm": 0.9658412933349609,
+      "learning_rate": 8.630136986301372e-6,
+      "loss": 0.1878,
+      "step": 15600
+    },
+    {
+      "epoch": 4.19,
+      "grad_norm": 1.0498002767562866,
+      "learning_rate": 8.356164383561644e-6,
+      "loss": 0.1825,
+      "step": 15700
+    },
+    {
+      "epoch": 4.21,
+      "grad_norm": 0.7098029255867004,
+      "learning_rate": 8.082191780821919e-6,
+      "loss": 0.1864,
+      "step": 15800
+    },
+    {
+      "epoch": 4.24,
+      "grad_norm": 0.9946851134300232,
+      "learning_rate": 7.808219178082192e-6,
+      "loss": 0.1852,
+      "step": 15900
+    },
+    {
+      "epoch": 4.27,
+      "grad_norm": 0.9338549375534058,
+      "learning_rate": 7.5342465753424655e-6,
+      "loss": 0.1865,
+      "step": 16000
+    },
+    {
+      "epoch": 4.29,
+      "grad_norm": 0.8193784952163696,
+      "learning_rate": 7.260273972602739e-6,
+      "loss": 0.184,
+      "step": 16100
+    },
+    {
+      "epoch": 4.32,
+      "grad_norm": 0.9323195815086365,
+      "learning_rate": 6.9863013698630145e-6,
+      "loss": 0.1845,
+      "step": 16200
+    },
+    {
+      "epoch": 4.35,
+      "grad_norm": 0.9668224453926086,
+      "learning_rate": 6.712328767123288e-6,
+      "loss": 0.1911,
+      "step": 16300
+    },
+    {
+      "epoch": 4.37,
+      "grad_norm": 0.9941351413726807,
+      "learning_rate": 6.438356164383562e-6,
+      "loss": 0.1859,
+      "step": 16400
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 0.9229924082756042,
+      "learning_rate": 6.1643835616438354e-6,
+      "loss": 0.1861,
+      "step": 16500
+    },
+    {
+      "epoch": 4.43,
+      "grad_norm": 0.8792287111282349,
+      "learning_rate": 5.89041095890411e-6,
+      "loss": 0.1903,
+      "step": 16600
+    },
+    {
+      "epoch": 4.45,
+      "grad_norm": 0.682725191116333,
+      "learning_rate": 5.616438356164384e-6,
+      "loss": 0.1822,
+      "step": 16700
+    },
+    {
+      "epoch": 4.48,
+      "grad_norm": 0.8012785315513611,
+      "learning_rate": 5.342465753424658e-6,
+      "loss": 0.1888,
+      "step": 16800
+    },
+    {
+      "epoch": 4.51,
+      "grad_norm": 0.7928184270858765,
+      "learning_rate": 5.068493150684932e-6,
+      "loss": 0.1869,
+      "step": 16900
+    },
+    {
+      "epoch": 4.53,
+      "grad_norm": 1.2073571681976318,
+      "learning_rate": 4.7945205479452054e-6,
+      "loss": 0.184,
+      "step": 17000
+    },
+    {
+      "epoch": 4.56,
+      "grad_norm": 0.763810396194458,
+      "learning_rate": 4.52054794520548e-6,
+      "loss": 0.1824,
+      "step": 17100
+    },
+    {
+      "epoch": 4.59,
+      "grad_norm": 0.8932220935821533,
+      "learning_rate": 4.246575342465754e-6,
+      "loss": 0.1898,
+      "step": 17200
+    },
+    {
+      "epoch": 4.61,
+      "grad_norm": 0.7250128984451294,
+      "learning_rate": 3.972602739726028e-6,
+      "loss": 0.1886,
+      "step": 17300
+    },
+    {
+      "epoch": 4.64,
+      "grad_norm": 1.0617702007293701,
+      "learning_rate": 3.6986301369863018e-6,
+      "loss": 0.1889,
+      "step": 17400
+    },
+    {
+      "epoch": 4.67,
+      "grad_norm": 0.983672022819519,
+      "learning_rate": 3.4246575342465754e-6,
+      "loss": 0.1871,
+      "step": 17500
+    },
+    {
+      "epoch": 4.69,
+      "grad_norm": 0.9392043352127075,
+      "learning_rate": 3.1506849315068495e-6,
+      "loss": 0.1869,
+      "step": 17600
+    },
+    {
+      "epoch": 4.72,
+      "grad_norm": 0.8135913014411926,
+      "learning_rate": 2.8767123287671236e-6,
+      "loss": 0.1861,
+      "step": 17700
+    },
+    {
+      "epoch": 4.75,
+      "grad_norm": 0.7956686615943909,
+      "learning_rate": 2.6027397260273973e-6,
+      "loss": 0.1864,
+      "step": 17800
+    },
+    {
+      "epoch": 4.77,
+      "grad_norm": 0.8956461548805237,
+      "learning_rate": 2.3287671232876713e-6,
+      "loss": 0.1889,
+      "step": 17900
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 0.9515472054481506,
+      "learning_rate": 2.054794520547945e-6,
+      "loss": 0.1871,
+      "step": 18000
+    },
+    {
+      "epoch": 4.83,
+      "grad_norm": 0.8886680006980896,
+      "learning_rate": 1.7808219178082193e-6,
+      "loss": 0.187,
+      "step": 18100
+    },
+    {
+      "epoch": 4.85,
+      "grad_norm": 0.8525242805480957,
+      "learning_rate": 1.5068493150684932e-6,
+      "loss": 0.1832,
+      "step": 18200
+    },
+    {
+      "epoch": 4.88,
+      "grad_norm": 0.9522444009780884,
+      "learning_rate": 1.232876712328767e-6,
+      "loss": 0.186,
+      "step": 18300
+    },
+    {
+      "epoch": 4.91,
+      "grad_norm": 0.8611086010932922,
+      "learning_rate": 9.589041095890411e-7,
+      "loss": 0.1855,
+      "step": 18400
+    },
+    {
+      "epoch": 4.93,
+      "grad_norm": 0.9658819437026978,
+      "learning_rate": 6.849315068493151e-7,
+      "loss": 0.177,
+      "step": 18500
+    },
+    {
+      "epoch": 4.96,
+      "grad_norm": 0.9198510646820068,
+      "learning_rate": 4.1095890410958903e-7,
+      "loss": 0.178,
+      "step": 18600
+    },
+    {
+      "epoch": 4.99,
+      "grad_norm": 0.8326091766357422,
+      "learning_rate": 1.36986301369863e-7,
+      "loss": 0.1865,
+      "step": 18700
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 18750,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 6250,
+  "total_flos": 8.12664225792e16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e10560e54c27682fe1af61856c09bdac5dfcb77c62ecca0494c7e6f4ba3762bd
+size 4920

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff