ntmkhanh committed on
Commit
747a654
1 Parent(s): 3d71e73

Upload 22 files

Browse files
README.md ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - generated_from_trainer
4
+ model-index:
5
+ - name: eval_bartpho_final
6
+ results: []
7
+ ---
8
+
9
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
10
+ should probably proofread and complete it, then remove this comment. -->
11
+
12
+ # eval_bartpho_final
13
+
14
+ This model is a fine-tuned version of [vinai/bartpho-word-base](https://huggingface.co/vinai/bartpho-word-base) on an unknown dataset.
15
+
16
+ ## Model description
17
+
18
+ More information needed
19
+
20
+ ## Intended uses & limitations
21
+
22
+ More information needed
23
+
24
+ ## Training and evaluation data
25
+
26
+ More information needed
27
+
28
+ ## Training procedure
29
+
30
+ ### Training hyperparameters
31
+
32
+ The following hyperparameters were used during training:
33
+ - learning_rate: 0.0001
34
+ - train_batch_size: 4
35
+ - eval_batch_size: 4
36
+ - seed: 42
37
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
38
+ - lr_scheduler_type: linear
39
+ - lr_scheduler_warmup_steps: 20000
40
+ - num_epochs: 5.0
41
+
42
+ ### Training results
43
+
44
+
45
+
46
+ ### Framework versions
47
+
48
+ - Transformers 4.24.0
49
+ - Pytorch 2.1.0+cu118
50
+ - Datasets 2.15.0
51
+ - Tokenizers 0.13.3
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<mask>": 64000
3
+ }
all_results.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "predict_gen_len": 134.9068783068783,
4
+ "predict_loss": 1.6737200021743774,
5
+ "predict_rouge1": 70.1854,
6
+ "predict_rouge2": 35.9507,
7
+ "predict_rougeL": 39.9199,
8
+ "predict_rougeLsum": 65.0469,
9
+ "predict_runtime": 933.4427,
10
+ "predict_samples": 1890,
11
+ "predict_samples_per_second": 2.025,
12
+ "predict_steps_per_second": 0.507,
13
+ "train_loss": 1.9915461536297583,
14
+ "train_runtime": 6040.9811,
15
+ "train_samples": 24300,
16
+ "train_samples_per_second": 20.113,
17
+ "train_steps_per_second": 5.028
18
+ }
bpe.codes ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "vinai/bartpho-word-base",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "MBartForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "bos_token_id": 0,
10
+ "classifier_dropout": 0.0,
11
+ "d_model": 768,
12
+ "decoder_attention_heads": 12,
13
+ "decoder_ffn_dim": 3072,
14
+ "decoder_layerdrop": 0.0,
15
+ "decoder_layers": 6,
16
+ "decoder_start_token_id": 2,
17
+ "dropout": 0.1,
18
+ "encoder_attention_heads": 12,
19
+ "encoder_ffn_dim": 3072,
20
+ "encoder_layerdrop": 0.0,
21
+ "encoder_layers": 6,
22
+ "eos_token_id": 2,
23
+ "forced_eos_token_id": 2,
24
+ "gradient_checkpointing": false,
25
+ "init_std": 0.02,
26
+ "is_encoder_decoder": true,
27
+ "max_position_embeddings": 1024,
28
+ "model_type": "mbart",
29
+ "num_hidden_layers": 6,
30
+ "pad_token_id": 1,
31
+ "scale_embedding": false,
32
+ "tokenizer_class": "PhobertTokenizer",
33
+ "torch_dtype": "float32",
34
+ "transformers_version": "4.24.0",
35
+ "use_cache": true,
36
+ "vocab_size": 64001
37
+ }
config.txt ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /content/drive/MyDrive/LuanVan/Bart-BRIO/brio_project-main/BRIO/main.py
2
+ Namespace(cuda=True, gpuid=[0], evaluate=False, do_reranking=False, do_generation=False, log=True, port=12355, model_pt='', config='', batch_size=1, epoch=1, report_freq=100, accumulate_step=8, margin=0.001, gold_margin=0, gold_weight=0, mle_weight=0.1, rank_weight=10, model_type='vinai/bartpho-word-base', warmup_steps=10000, normalize=True, grad_norm=0, seed=970903, no_gold=False, pretrained='./finetuned_model_v3/eval_bartpho_final', max_lr=0.002, scale=0.5, score_mode='log', datatype='diverse', dataset='cooking_bart', max_len=120, max_num=6, smooth=0.01, total_len=1024, length_penalty=2.0, do_sample=True, gen_max_len=140, gen_min_len=55, is_pegasus=False, adding=0, eval_interval=1000, num_beams=6)
3
+
4
+ BRIO(
5
+ (model): MBartScorer(
6
+ (model): CustomMBartModel(
7
+ (shared): Embedding(64001, 768, padding_idx=1)
8
+ (encoder): MBartEncoder(
9
+ (embed_tokens): Embedding(64001, 768, padding_idx=1)
10
+ (embed_positions): MBartLearnedPositionalEmbedding(1026, 768)
11
+ (layers): ModuleList(
12
+ (0-5): 6 x MBartEncoderLayer(
13
+ (self_attn): MBartAttention(
14
+ (k_proj): Linear(in_features=768, out_features=768, bias=True)
15
+ (v_proj): Linear(in_features=768, out_features=768, bias=True)
16
+ (q_proj): Linear(in_features=768, out_features=768, bias=True)
17
+ (out_proj): Linear(in_features=768, out_features=768, bias=True)
18
+ )
19
+ (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
20
+ (activation_fn): GELUActivation()
21
+ (fc1): Linear(in_features=768, out_features=3072, bias=True)
22
+ (fc2): Linear(in_features=3072, out_features=768, bias=True)
23
+ (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
24
+ )
25
+ )
26
+ (layernorm_embedding): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
27
+ (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
28
+ )
29
+ (decoder): MBartDecoder(
30
+ (embed_tokens): Embedding(64001, 768, padding_idx=1)
31
+ (embed_positions): MBartLearnedPositionalEmbedding(1026, 768)
32
+ (layers): ModuleList(
33
+ (0-5): 6 x MBartDecoderLayer(
34
+ (self_attn): MBartAttention(
35
+ (k_proj): Linear(in_features=768, out_features=768, bias=True)
36
+ (v_proj): Linear(in_features=768, out_features=768, bias=True)
37
+ (q_proj): Linear(in_features=768, out_features=768, bias=True)
38
+ (out_proj): Linear(in_features=768, out_features=768, bias=True)
39
+ )
40
+ (activation_fn): GELUActivation()
41
+ (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
42
+ (encoder_attn): MBartAttention(
43
+ (k_proj): Linear(in_features=768, out_features=768, bias=True)
44
+ (v_proj): Linear(in_features=768, out_features=768, bias=True)
45
+ (q_proj): Linear(in_features=768, out_features=768, bias=True)
46
+ (out_proj): Linear(in_features=768, out_features=768, bias=True)
47
+ )
48
+ (encoder_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
49
+ (fc1): Linear(in_features=768, out_features=3072, bias=True)
50
+ (fc2): Linear(in_features=3072, out_features=768, bias=True)
51
+ (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
52
+ )
53
+ )
54
+ (layernorm_embedding): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
55
+ (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
56
+ )
57
+ )
58
+ (lm_head): Linear(in_features=768, out_features=64001, bias=False)
59
+ )
60
+ )
61
+
events.out.tfevents.1700387487.27f5c4c183d3.6020.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4aca334338936d669103a54136c40f8fd42fdb5d0af9b1bb71427d32e6be207
3
+ size 40
generated_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
 
log.txt ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ epoch: 1, batch: 100, avg loss: 1.035541, avg ranking loss: 0.016568, avg mle loss: 8.698632
2
+ learning rate: 0.000000
3
+
4
+ epoch: 1, batch: 200, avg loss: 0.939091, avg ranking loss: 0.015825, avg mle loss: 7.808369
5
+ learning rate: 0.000000
6
+
7
+ epoch: 1, batch: 300, avg loss: 0.786419, avg ranking loss: 0.015178, avg mle loss: 6.346376
8
+ learning rate: 0.000001
9
+
10
+ epoch: 1, batch: 400, avg loss: 0.653152, avg ranking loss: 0.014788, avg mle loss: 5.052757
11
+ learning rate: 0.000001
12
+
13
+ epoch: 1, batch: 500, avg loss: 0.583150, avg ranking loss: 0.014760, avg mle loss: 4.355531
14
+ learning rate: 0.000001
15
+
16
+ epoch: 1, batch: 600, avg loss: 0.531894, avg ranking loss: 0.014850, avg mle loss: 3.833926
17
+ learning rate: 0.000001
18
+
19
+ epoch: 1, batch: 700, avg loss: 0.496518, avg ranking loss: 0.014805, avg mle loss: 3.484637
20
+ learning rate: 0.000001
21
+
22
+ epoch: 1, batch: 800, avg loss: 0.470519, avg ranking loss: 0.014618, avg mle loss: 3.243440
23
+ learning rate: 0.000002
24
+
25
+ epoch: 1, batch: 900, avg loss: 0.449544, avg ranking loss: 0.014501, avg mle loss: 3.045354
26
+ learning rate: 0.000002
27
+
28
+ epoch: 1, batch: 1000, avg loss: 0.431474, avg ranking loss: 0.014440, avg mle loss: 2.870745
29
+ learning rate: 0.000002
30
+
31
+ best ranking loss - epoch: 0, batch: 999
32
+ val ranking loss: 0.729979
33
+ val ranking rouge1: 0.629180, rouge2: 0.334733, rougeLsum: 0.599455
34
+ best generation loss - epoch: 0, batch: 999
35
+ val generation loss: 0.885783
36
+ val generation rouge1: 0.321491, rouge2: 0.134002, rougeLsum: 0.299572
37
+ epoch: 1, batch: 1100, avg loss: 0.419558, avg ranking loss: 0.014534, avg mle loss: 2.742168
38
+ learning rate: 0.000002
39
+
40
+ epoch: 1, batch: 1200, avg loss: 0.405170, avg ranking loss: 0.014500, avg mle loss: 2.601675
41
+ learning rate: 0.000002
42
+
43
+ epoch: 1, batch: 1300, avg loss: 0.394226, avg ranking loss: 0.014297, avg mle loss: 2.512567
44
+ learning rate: 0.000003
45
+
46
+ epoch: 1, batch: 1400, avg loss: 0.388203, avg ranking loss: 0.014295, avg mle loss: 2.452513
47
+ learning rate: 0.000003
48
+
49
+ epoch: 1, batch: 1500, avg loss: 0.379525, avg ranking loss: 0.014269, avg mle loss: 2.368302
50
+ learning rate: 0.000003
51
+
52
+ epoch: 1, batch: 1600, avg loss: 0.375064, avg ranking loss: 0.014436, avg mle loss: 2.307035
53
+ learning rate: 0.000003
54
+
55
+ epoch: 1, batch: 1700, avg loss: 0.364328, avg ranking loss: 0.014073, avg mle loss: 2.235932
56
+ learning rate: 0.000003
57
+
58
+ epoch: 1, batch: 1800, avg loss: 0.362203, avg ranking loss: 0.014417, avg mle loss: 2.180354
59
+ learning rate: 0.000004
60
+
61
+ epoch: 1, batch: 1900, avg loss: 0.358076, avg ranking loss: 0.014373, avg mle loss: 2.143419
62
+ learning rate: 0.000004
63
+
64
+ epoch: 1, batch: 2000, avg loss: 0.351821, avg ranking loss: 0.014282, avg mle loss: 2.089963
65
+ learning rate: 0.000004
66
+
67
+ val ranking loss: 0.734460
68
+ val ranking rouge1: 0.620741, rouge2: 0.331641, rougeLsum: 0.590756
69
+ val generation loss: 0.885808
70
+ val generation rouge1: 0.314677, rouge2: 0.155852, rougeLsum: 0.293533
71
+ epoch: 1, batch: 2100, avg loss: 0.349724, avg ranking loss: 0.014392, avg mle loss: 2.058075
72
+ learning rate: 0.000004
73
+
74
+ epoch: 1, batch: 2200, avg loss: 0.343959, avg ranking loss: 0.014182, avg mle loss: 2.021375
75
+ learning rate: 0.000004
76
+
77
+ epoch: 1, batch: 2300, avg loss: 0.340133, avg ranking loss: 0.014330, avg mle loss: 1.968376
78
+ learning rate: 0.000005
79
+
80
+ epoch: 1, batch: 2400, avg loss: 0.336417, avg ranking loss: 0.014204, avg mle loss: 1.943732
81
+ learning rate: 0.000005
82
+
83
+ epoch: 1, batch: 2500, avg loss: 0.330869, avg ranking loss: 0.014164, avg mle loss: 1.892253
84
+ learning rate: 0.000005
85
+
86
+ epoch: 1, batch: 2600, avg loss: 0.329880, avg ranking loss: 0.014310, avg mle loss: 1.867807
87
+ learning rate: 0.000005
88
+
89
+ epoch: 1, batch: 2700, avg loss: 0.326743, avg ranking loss: 0.014328, avg mle loss: 1.834671
90
+ learning rate: 0.000005
91
+
92
+ epoch: 1, batch: 2800, avg loss: 0.323882, avg ranking loss: 0.014190, avg mle loss: 1.819801
93
+ learning rate: 0.000006
94
+
95
+ epoch: 1, batch: 2900, avg loss: 0.320222, avg ranking loss: 0.013906, avg mle loss: 1.811585
96
+ learning rate: 0.000006
97
+
98
+ epoch: 1, batch: 3000, avg loss: 0.317352, avg ranking loss: 0.014259, avg mle loss: 1.747636
99
+ learning rate: 0.000006
100
+
101
+ val ranking loss: 0.736030
102
+ val ranking rouge1: 0.617348, rouge2: 0.330589, rougeLsum: 0.587821
103
+ best generation loss - epoch: 0, batch: 2999
104
+ val generation loss: 0.884436
105
+ val generation rouge1: 0.314360, rouge2: 0.161066, rougeLsum: 0.296060
model_cur.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3384fc78d9ca93241399257e4ba6e5153d2a2d47c68bd6bc30c62f5f469398dd
3
+ size 600249722
model_ranking.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12bba296c12cfeaefbd8edd6ea05439d14a8f9757c391f60f417c8c9bdc58fa7
3
+ size 600250794
optimizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f08f48bebcdeee3b8d8ef40a70c7c88616604b45693c95c927ce1b892da2a5a2
3
+ size 1875706
predict_results.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "predict_gen_len": 134.9068783068783,
3
+ "predict_loss": 1.6737200021743774,
4
+ "predict_rouge1": 70.1854,
5
+ "predict_rouge2": 35.9507,
6
+ "predict_rougeL": 39.9199,
7
+ "predict_rougeLsum": 65.0469,
8
+ "predict_runtime": 933.4427,
9
+ "predict_samples": 1890,
10
+ "predict_samples_per_second": 2.025,
11
+ "predict_steps_per_second": 0.507
12
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48b3b0a626a7f64c95b06b581a13cdc6839aecc1c71959fa61a9df75ede51a70
3
+ size 600251598
runs/Nov17_14-39-17_5ec2d2c8288f/1700232016.418216/events.out.tfevents.1700232016.5ec2d2c8288f.1764.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc0e0a620d7ca1ec33e2b8cd36da8e56c1c37c5b2c68fe66b5582b4c4b480980
3
+ size 6000
runs/Nov17_14-39-17_5ec2d2c8288f/events.out.tfevents.1700232016.5ec2d2c8288f.1764.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92726d32fd3d70cee2b86772ba391ba1804a297dbad24e7a35f34b7eb27f2b76
3
+ size 14283
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": "<mask>",
6
+ "pad_token": "<pad>",
7
+ "sep_token": "</s>",
8
+ "unk_token": "<unk>"
9
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": "<mask>",
6
+ "name_or_path": "vinai/bartpho-word-base",
7
+ "pad_token": "<pad>",
8
+ "sep_token": "</s>",
9
+ "special_tokens_map_file": null,
10
+ "tokenizer_class": "PhobertTokenizer",
11
+ "unk_token": "<unk>"
12
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "train_loss": 1.9915461536297583,
4
+ "train_runtime": 6040.9811,
5
+ "train_samples": 24300,
6
+ "train_samples_per_second": 20.113,
7
+ "train_steps_per_second": 5.028
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "global_step": 30375,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.08,
12
+ "learning_rate": 2.5e-06,
13
+ "loss": 4.9897,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 0.16,
18
+ "learning_rate": 5e-06,
19
+ "loss": 3.4867,
20
+ "step": 1000
21
+ },
22
+ {
23
+ "epoch": 0.25,
24
+ "learning_rate": 7.5e-06,
25
+ "loss": 3.0749,
26
+ "step": 1500
27
+ },
28
+ {
29
+ "epoch": 0.33,
30
+ "learning_rate": 1e-05,
31
+ "loss": 2.8423,
32
+ "step": 2000
33
+ },
34
+ {
35
+ "epoch": 0.41,
36
+ "learning_rate": 1.25e-05,
37
+ "loss": 2.6817,
38
+ "step": 2500
39
+ },
40
+ {
41
+ "epoch": 0.49,
42
+ "learning_rate": 1.5e-05,
43
+ "loss": 2.542,
44
+ "step": 3000
45
+ },
46
+ {
47
+ "epoch": 0.58,
48
+ "learning_rate": 1.75e-05,
49
+ "loss": 2.4682,
50
+ "step": 3500
51
+ },
52
+ {
53
+ "epoch": 0.66,
54
+ "learning_rate": 2e-05,
55
+ "loss": 2.3818,
56
+ "step": 4000
57
+ },
58
+ {
59
+ "epoch": 0.74,
60
+ "learning_rate": 2.25e-05,
61
+ "loss": 2.3417,
62
+ "step": 4500
63
+ },
64
+ {
65
+ "epoch": 0.82,
66
+ "learning_rate": 2.5e-05,
67
+ "loss": 2.2735,
68
+ "step": 5000
69
+ },
70
+ {
71
+ "epoch": 0.91,
72
+ "learning_rate": 2.7500000000000004e-05,
73
+ "loss": 2.2405,
74
+ "step": 5500
75
+ },
76
+ {
77
+ "epoch": 0.99,
78
+ "learning_rate": 3e-05,
79
+ "loss": 2.2079,
80
+ "step": 6000
81
+ },
82
+ {
83
+ "epoch": 1.07,
84
+ "learning_rate": 3.2500000000000004e-05,
85
+ "loss": 2.1451,
86
+ "step": 6500
87
+ },
88
+ {
89
+ "epoch": 1.15,
90
+ "learning_rate": 3.5e-05,
91
+ "loss": 2.126,
92
+ "step": 7000
93
+ },
94
+ {
95
+ "epoch": 1.23,
96
+ "learning_rate": 3.7500000000000003e-05,
97
+ "loss": 2.1235,
98
+ "step": 7500
99
+ },
100
+ {
101
+ "epoch": 1.32,
102
+ "learning_rate": 4e-05,
103
+ "loss": 2.094,
104
+ "step": 8000
105
+ },
106
+ {
107
+ "epoch": 1.4,
108
+ "learning_rate": 4.25e-05,
109
+ "loss": 2.06,
110
+ "step": 8500
111
+ },
112
+ {
113
+ "epoch": 1.48,
114
+ "learning_rate": 4.5e-05,
115
+ "loss": 2.0661,
116
+ "step": 9000
117
+ },
118
+ {
119
+ "epoch": 1.56,
120
+ "learning_rate": 4.75e-05,
121
+ "loss": 2.0378,
122
+ "step": 9500
123
+ },
124
+ {
125
+ "epoch": 1.65,
126
+ "learning_rate": 5e-05,
127
+ "loss": 2.0357,
128
+ "step": 10000
129
+ },
130
+ {
131
+ "epoch": 1.73,
132
+ "learning_rate": 5.25e-05,
133
+ "loss": 2.0264,
134
+ "step": 10500
135
+ },
136
+ {
137
+ "epoch": 1.81,
138
+ "learning_rate": 5.500000000000001e-05,
139
+ "loss": 2.0025,
140
+ "step": 11000
141
+ },
142
+ {
143
+ "epoch": 1.89,
144
+ "learning_rate": 5.7499999999999995e-05,
145
+ "loss": 1.9848,
146
+ "step": 11500
147
+ },
148
+ {
149
+ "epoch": 1.98,
150
+ "learning_rate": 6e-05,
151
+ "loss": 1.9737,
152
+ "step": 12000
153
+ },
154
+ {
155
+ "epoch": 2.06,
156
+ "learning_rate": 6.25e-05,
157
+ "loss": 1.9221,
158
+ "step": 12500
159
+ },
160
+ {
161
+ "epoch": 2.14,
162
+ "learning_rate": 6.500000000000001e-05,
163
+ "loss": 1.8885,
164
+ "step": 13000
165
+ },
166
+ {
167
+ "epoch": 2.22,
168
+ "learning_rate": 6.750000000000001e-05,
169
+ "loss": 1.8812,
170
+ "step": 13500
171
+ },
172
+ {
173
+ "epoch": 2.3,
174
+ "learning_rate": 7e-05,
175
+ "loss": 1.8832,
176
+ "step": 14000
177
+ },
178
+ {
179
+ "epoch": 2.39,
180
+ "learning_rate": 7.25e-05,
181
+ "loss": 1.8955,
182
+ "step": 14500
183
+ },
184
+ {
185
+ "epoch": 2.47,
186
+ "learning_rate": 7.500000000000001e-05,
187
+ "loss": 1.8907,
188
+ "step": 15000
189
+ },
190
+ {
191
+ "epoch": 2.55,
192
+ "learning_rate": 7.75e-05,
193
+ "loss": 1.8945,
194
+ "step": 15500
195
+ },
196
+ {
197
+ "epoch": 2.63,
198
+ "learning_rate": 8e-05,
199
+ "loss": 1.8805,
200
+ "step": 16000
201
+ },
202
+ {
203
+ "epoch": 2.72,
204
+ "learning_rate": 8.25e-05,
205
+ "loss": 1.891,
206
+ "step": 16500
207
+ },
208
+ {
209
+ "epoch": 2.8,
210
+ "learning_rate": 8.5e-05,
211
+ "loss": 1.8689,
212
+ "step": 17000
213
+ },
214
+ {
215
+ "epoch": 2.88,
216
+ "learning_rate": 8.75e-05,
217
+ "loss": 1.8594,
218
+ "step": 17500
219
+ },
220
+ {
221
+ "epoch": 2.96,
222
+ "learning_rate": 9e-05,
223
+ "loss": 1.8787,
224
+ "step": 18000
225
+ },
226
+ {
227
+ "epoch": 3.05,
228
+ "learning_rate": 9.250000000000001e-05,
229
+ "loss": 1.8105,
230
+ "step": 18500
231
+ },
232
+ {
233
+ "epoch": 3.13,
234
+ "learning_rate": 9.5e-05,
235
+ "loss": 1.7637,
236
+ "step": 19000
237
+ },
238
+ {
239
+ "epoch": 3.21,
240
+ "learning_rate": 9.75e-05,
241
+ "loss": 1.7573,
242
+ "step": 19500
243
+ },
244
+ {
245
+ "epoch": 3.29,
246
+ "learning_rate": 0.0001,
247
+ "loss": 1.7688,
248
+ "step": 20000
249
+ },
250
+ {
251
+ "epoch": 3.37,
252
+ "learning_rate": 9.518072289156626e-05,
253
+ "loss": 1.7749,
254
+ "step": 20500
255
+ },
256
+ {
257
+ "epoch": 3.46,
258
+ "learning_rate": 9.036144578313253e-05,
259
+ "loss": 1.7637,
260
+ "step": 21000
261
+ },
262
+ {
263
+ "epoch": 3.54,
264
+ "learning_rate": 8.55421686746988e-05,
265
+ "loss": 1.7544,
266
+ "step": 21500
267
+ },
268
+ {
269
+ "epoch": 3.62,
270
+ "learning_rate": 8.072289156626507e-05,
271
+ "loss": 1.7499,
272
+ "step": 22000
273
+ },
274
+ {
275
+ "epoch": 3.7,
276
+ "learning_rate": 7.590361445783133e-05,
277
+ "loss": 1.7189,
278
+ "step": 22500
279
+ },
280
+ {
281
+ "epoch": 3.79,
282
+ "learning_rate": 7.108433734939759e-05,
283
+ "loss": 1.7285,
284
+ "step": 23000
285
+ },
286
+ {
287
+ "epoch": 3.87,
288
+ "learning_rate": 6.626506024096386e-05,
289
+ "loss": 1.7168,
290
+ "step": 23500
291
+ },
292
+ {
293
+ "epoch": 3.95,
294
+ "learning_rate": 6.144578313253012e-05,
295
+ "loss": 1.6973,
296
+ "step": 24000
297
+ },
298
+ {
299
+ "epoch": 4.03,
300
+ "learning_rate": 5.6626506024096394e-05,
301
+ "loss": 1.6223,
302
+ "step": 24500
303
+ },
304
+ {
305
+ "epoch": 4.12,
306
+ "learning_rate": 5.180722891566265e-05,
307
+ "loss": 1.5353,
308
+ "step": 25000
309
+ },
310
+ {
311
+ "epoch": 4.2,
312
+ "learning_rate": 4.698795180722892e-05,
313
+ "loss": 1.4975,
314
+ "step": 25500
315
+ },
316
+ {
317
+ "epoch": 4.28,
318
+ "learning_rate": 4.2168674698795186e-05,
319
+ "loss": 1.5138,
320
+ "step": 26000
321
+ },
322
+ {
323
+ "epoch": 4.36,
324
+ "learning_rate": 3.734939759036144e-05,
325
+ "loss": 1.5093,
326
+ "step": 26500
327
+ },
328
+ {
329
+ "epoch": 4.44,
330
+ "learning_rate": 3.253012048192771e-05,
331
+ "loss": 1.5031,
332
+ "step": 27000
333
+ },
334
+ {
335
+ "epoch": 4.53,
336
+ "learning_rate": 2.7710843373493977e-05,
337
+ "loss": 1.4948,
338
+ "step": 27500
339
+ },
340
+ {
341
+ "epoch": 4.61,
342
+ "learning_rate": 2.289156626506024e-05,
343
+ "loss": 1.4928,
344
+ "step": 28000
345
+ },
346
+ {
347
+ "epoch": 4.69,
348
+ "learning_rate": 1.8072289156626505e-05,
349
+ "loss": 1.4922,
350
+ "step": 28500
351
+ },
352
+ {
353
+ "epoch": 4.77,
354
+ "learning_rate": 1.3253012048192772e-05,
355
+ "loss": 1.4596,
356
+ "step": 29000
357
+ },
358
+ {
359
+ "epoch": 4.86,
360
+ "learning_rate": 8.433734939759036e-06,
361
+ "loss": 1.4649,
362
+ "step": 29500
363
+ },
364
+ {
365
+ "epoch": 4.94,
366
+ "learning_rate": 3.614457831325301e-06,
367
+ "loss": 1.4643,
368
+ "step": 30000
369
+ },
370
+ {
371
+ "epoch": 5.0,
372
+ "step": 30375,
373
+ "total_flos": 5570927176531968.0,
374
+ "train_loss": 1.9915461536297583,
375
+ "train_runtime": 6040.9811,
376
+ "train_samples_per_second": 20.113,
377
+ "train_steps_per_second": 5.028
378
+ }
379
+ ],
380
+ "max_steps": 30375,
381
+ "num_train_epochs": 5,
382
+ "total_flos": 5570927176531968.0,
383
+ "trial_name": null,
384
+ "trial_params": null
385
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e1b29470134c45b922406b8eb209f32249f2c703e6e53e25d8952db46eabfec
3
+ size 4152
vocab.txt ADDED
The diff for this file is too large to render. See raw diff