yuchenlin commited on
Commit
ae9af6a
1 Parent(s): a4a41ca

8000 of bart0-base

Browse files
README.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ datasets:
3
+ - bigscience/P3
4
+ language: en
5
+ license: apache-2.0
6
+ widget:
7
+ - text: "A is the son of B's uncle. What is the family relationship between A and B?"
8
+ - text: "Reorder the words in this sentence: justin and name bieber years is my am I 27 old."
9
+ - text: "Task: copy but say the opposite.\n
10
+ PSG won its match against Barca."
11
+ - text: "Is this review positive or negative? Review: Best cast iron skillet you will ever buy."
12
+ example_title: "Sentiment analysis"
13
+ - text: "Question A: How is air traffic controlled?
14
+ \nQuestion B: How do you become an air traffic controller?\nPick one: these questions are duplicates or not duplicates."
15
+ - text: "Barack Obama nominated Hillary Clinton as his secretary of state on Monday. He chose her because she had foreign affairs experience as a former First Lady.
16
+ \nIn the previous sentence, decide who 'her' is referring to."
17
+ example_title: "Coreference resolution"
18
+ - text: "Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.\n
19
+ Select the category for the above sentence from: mobile, website, billing, account access."
20
+ - text: "Sentence 1: Gyorgy Heizler, head of the local disaster unit, said the coach was carrying 38 passengers.\n
21
+ Sentence 2: The head of the local disaster unit, Gyorgy Heizler, said the bus was full except for 38 empty seats.\n\n
22
+ Do sentences 1 and 2 have the same meaning?"
23
+ example_title: "Paraphrase identification"
24
+ - text: "Here's the beginning of an article, choose a tag that best describes the topic of the article: business, cinema, politics, health, travel, sports.\n\n
25
+ The best and worst of 007 as 'No time to die' marks Daniel Craig's exit.\n
26
+ (CNN) Some 007 math: 60 years, 25 movies (with a small asterisk) and six James Bonds. For a Cold War creation, Ian Fleming's suave spy has certainly gotten around, but despite different guises in the tuxedo and occasional scuba gear, when it comes to Bond ratings, there really shouldn't be much argument about who wore it best."
27
+ - text: "Max: Know any good websites to buy clothes from?\n
28
+ Payton: Sure :) LINK 1, LINK 2, LINK 3\n
29
+ Max: That's a lot of them!\n
30
+ Payton: Yeah, but they have different things so I usually buy things from 2 or 3 of them.\n
31
+ Max: I'll check them out. Thanks.\n\n
32
+ Who or what are Payton and Max referring to when they say 'them'?"
33
+ - text: "Is the word 'table' used in the same meaning in the two following sentences?\n\n
34
+ Sentence A: you can leave the books on the table over there.\n
35
+ Sentence B: the tables in this book are very hard to read."
36
+ - text: "On a shelf, there are five books: a gray book, a red book, a purple book, a blue book, and a black book.\n
37
+ The red book is to the right of the gray book. The black book is to the left of the blue book. The blue book is to the left of the gray book. The purple book is the second from the right.\n\n
38
+ Which book is the leftmost book?"
39
+ example_title: "Logic puzzles"
40
+ - text: "The two men running to become New York City's next mayor will face off in their first debate Wednesday night.\n\n
41
+ Democrat Eric Adams, the Brooklyn Borough president and a former New York City police captain, is widely expected to win the Nov. 2 election against Republican Curtis Sliwa, the founder of the 1970s-era Guardian Angels anti-crime patrol.\n\n
42
+ Who are the men running for mayor?"
43
+ example_title: "Reading comprehension"
44
+ - text: "The word 'binne' means any animal that is furry and has four legs, and the word 'bam' means a simple sort of dwelling.\n\n
45
+ Which of the following best characterizes binne bams?\n
46
+ - Sentence 1: Binne bams are for pets.\n
47
+ - Sentence 2: Binne bams are typically furnished with sofas and televisions.\n
48
+ - Sentence 3: Binne bams are luxurious apartments.\n
49
+ - Sentence 4: Binne bams are places where people live."
50
+ ---
51
+
52
+ TBA
config.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/bart-base",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "gelu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.1,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_ffn_dim": 3072,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 12,
23
+ "encoder_ffn_dim": 3072,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 2,
27
+ "forced_eos_token_id": 2,
28
+ "gradient_checkpointing": false,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1",
32
+ "2": "LABEL_2"
33
+ },
34
+ "init_std": 0.02,
35
+ "is_encoder_decoder": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_position_embeddings": 1024,
42
+ "model_type": "bart",
43
+ "no_repeat_ngram_size": 3,
44
+ "normalize_before": false,
45
+ "normalize_embedding": true,
46
+ "num_beams": 4,
47
+ "num_hidden_layers": 6,
48
+ "pad_token_id": 1,
49
+ "scale_embedding": false,
50
+ "task_specific_params": {
51
+ "summarization": {
52
+ "length_penalty": 1.0,
53
+ "max_length": 128,
54
+ "min_length": 12,
55
+ "num_beams": 4
56
+ },
57
+ "summarization_cnn": {
58
+ "length_penalty": 2.0,
59
+ "max_length": 142,
60
+ "min_length": 56,
61
+ "num_beams": 4
62
+ },
63
+ "summarization_xsum": {
64
+ "length_penalty": 1.0,
65
+ "max_length": 62,
66
+ "min_length": 11,
67
+ "num_beams": 6
68
+ }
69
+ },
70
+ "torch_dtype": "float32",
71
+ "transformers_version": "4.12.5",
72
+ "use_cache": true,
73
+ "vocab_size": 50265
74
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ec55770ac5772dacf707b630ddaa101a4a52d15c235ab4df1c0943d0ff2a0ec
3
+ size 1115513717
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35bbfa70c58a9375d1eb22db049f8f84853dd7893937fb6d3201c63d54dedc6a
3
+ size 557979193
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5291e3ed381afc496effc538f087f3918e3fe5352163cc27cefd75c0c948491e
3
+ size 21515
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94018654b6c4468ad79069048e581fbbf5cc00e7f69c1bde948db425356d0796
3
+ size 559
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd0ce319bb038aea3f3bf4993c8f9cb14dc0332875c74964501c4fabb01a1ecd
3
+ size 623
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "facebook/bart-base", "tokenizer_class": "BartTokenizer"}
trainer_state.json ADDED
@@ -0,0 +1,1616 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.026070237159729,
3
+ "best_model_checkpoint": "./outputs/BART0-base/checkpoint-2000",
4
+ "epoch": 1.3402294999581206,
5
+ "global_step": 8000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 4.979477299380131e-05,
13
+ "loss": 3.1968,
14
+ "step": 50
15
+ },
16
+ {
17
+ "epoch": 0.02,
18
+ "learning_rate": 4.958535768135366e-05,
19
+ "loss": 2.5802,
20
+ "step": 100
21
+ },
22
+ {
23
+ "epoch": 0.02,
24
+ "eval_loss": 1.012151837348938,
25
+ "eval_runtime": 15.5603,
26
+ "eval_samples_per_second": 597.674,
27
+ "eval_steps_per_second": 2.378,
28
+ "step": 100
29
+ },
30
+ {
31
+ "epoch": 0.03,
32
+ "learning_rate": 4.9375942368906014e-05,
33
+ "loss": 2.4448,
34
+ "step": 150
35
+ },
36
+ {
37
+ "epoch": 0.03,
38
+ "learning_rate": 4.916652705645837e-05,
39
+ "loss": 2.3684,
40
+ "step": 200
41
+ },
42
+ {
43
+ "epoch": 0.03,
44
+ "eval_loss": 1.0450046062469482,
45
+ "eval_runtime": 15.8481,
46
+ "eval_samples_per_second": 586.822,
47
+ "eval_steps_per_second": 2.335,
48
+ "step": 200
49
+ },
50
+ {
51
+ "epoch": 0.04,
52
+ "learning_rate": 4.8957111744010725e-05,
53
+ "loss": 2.3267,
54
+ "step": 250
55
+ },
56
+ {
57
+ "epoch": 0.05,
58
+ "learning_rate": 4.874769643156308e-05,
59
+ "loss": 2.2764,
60
+ "step": 300
61
+ },
62
+ {
63
+ "epoch": 0.05,
64
+ "eval_loss": 1.0499160289764404,
65
+ "eval_runtime": 15.8218,
66
+ "eval_samples_per_second": 587.798,
67
+ "eval_steps_per_second": 2.339,
68
+ "step": 300
69
+ },
70
+ {
71
+ "epoch": 0.06,
72
+ "learning_rate": 4.853828111911543e-05,
73
+ "loss": 2.2669,
74
+ "step": 350
75
+ },
76
+ {
77
+ "epoch": 0.07,
78
+ "learning_rate": 4.8328865806667784e-05,
79
+ "loss": 2.2265,
80
+ "step": 400
81
+ },
82
+ {
83
+ "epoch": 0.07,
84
+ "eval_loss": 1.0236656665802002,
85
+ "eval_runtime": 15.7197,
86
+ "eval_samples_per_second": 591.616,
87
+ "eval_steps_per_second": 2.354,
88
+ "step": 400
89
+ },
90
+ {
91
+ "epoch": 0.08,
92
+ "learning_rate": 4.811945049422014e-05,
93
+ "loss": 2.2294,
94
+ "step": 450
95
+ },
96
+ {
97
+ "epoch": 0.08,
98
+ "learning_rate": 4.7910035181772494e-05,
99
+ "loss": 2.1933,
100
+ "step": 500
101
+ },
102
+ {
103
+ "epoch": 0.08,
104
+ "eval_loss": 1.05411958694458,
105
+ "eval_runtime": 16.1405,
106
+ "eval_samples_per_second": 576.19,
107
+ "eval_steps_per_second": 2.292,
108
+ "step": 500
109
+ },
110
+ {
111
+ "epoch": 0.09,
112
+ "learning_rate": 4.770061986932485e-05,
113
+ "loss": 2.2091,
114
+ "step": 550
115
+ },
116
+ {
117
+ "epoch": 0.1,
118
+ "learning_rate": 4.74912045568772e-05,
119
+ "loss": 2.2114,
120
+ "step": 600
121
+ },
122
+ {
123
+ "epoch": 0.1,
124
+ "eval_loss": 1.0270859003067017,
125
+ "eval_runtime": 16.1225,
126
+ "eval_samples_per_second": 576.832,
127
+ "eval_steps_per_second": 2.295,
128
+ "step": 600
129
+ },
130
+ {
131
+ "epoch": 0.11,
132
+ "learning_rate": 4.728178924442955e-05,
133
+ "loss": 2.1721,
134
+ "step": 650
135
+ },
136
+ {
137
+ "epoch": 0.12,
138
+ "learning_rate": 4.707237393198191e-05,
139
+ "loss": 2.1518,
140
+ "step": 700
141
+ },
142
+ {
143
+ "epoch": 0.12,
144
+ "eval_loss": 1.0103075504302979,
145
+ "eval_runtime": 15.7699,
146
+ "eval_samples_per_second": 589.729,
147
+ "eval_steps_per_second": 2.346,
148
+ "step": 700
149
+ },
150
+ {
151
+ "epoch": 0.13,
152
+ "learning_rate": 4.6867146925783216e-05,
153
+ "loss": 2.1758,
154
+ "step": 750
155
+ },
156
+ {
157
+ "epoch": 0.13,
158
+ "learning_rate": 4.665773161333557e-05,
159
+ "loss": 2.0953,
160
+ "step": 800
161
+ },
162
+ {
163
+ "epoch": 0.13,
164
+ "eval_loss": 1.074766993522644,
165
+ "eval_runtime": 15.6532,
166
+ "eval_samples_per_second": 594.128,
167
+ "eval_steps_per_second": 2.364,
168
+ "step": 800
169
+ },
170
+ {
171
+ "epoch": 0.14,
172
+ "learning_rate": 4.644831630088792e-05,
173
+ "loss": 2.097,
174
+ "step": 850
175
+ },
176
+ {
177
+ "epoch": 0.15,
178
+ "learning_rate": 4.6238900988440275e-05,
179
+ "loss": 2.1284,
180
+ "step": 900
181
+ },
182
+ {
183
+ "epoch": 0.15,
184
+ "eval_loss": 1.0292476415634155,
185
+ "eval_runtime": 15.7987,
186
+ "eval_samples_per_second": 588.655,
187
+ "eval_steps_per_second": 2.342,
188
+ "step": 900
189
+ },
190
+ {
191
+ "epoch": 0.16,
192
+ "learning_rate": 4.602948567599263e-05,
193
+ "loss": 2.1503,
194
+ "step": 950
195
+ },
196
+ {
197
+ "epoch": 0.17,
198
+ "learning_rate": 4.5820070363544986e-05,
199
+ "loss": 2.1173,
200
+ "step": 1000
201
+ },
202
+ {
203
+ "epoch": 0.17,
204
+ "eval_loss": 1.0330989360809326,
205
+ "eval_runtime": 16.1941,
206
+ "eval_samples_per_second": 574.283,
207
+ "eval_steps_per_second": 2.285,
208
+ "step": 1000
209
+ },
210
+ {
211
+ "epoch": 0.18,
212
+ "learning_rate": 4.561065505109734e-05,
213
+ "loss": 2.064,
214
+ "step": 1050
215
+ },
216
+ {
217
+ "epoch": 0.18,
218
+ "learning_rate": 4.54096163511476e-05,
219
+ "loss": 2.1102,
220
+ "step": 1100
221
+ },
222
+ {
223
+ "epoch": 0.18,
224
+ "eval_loss": 1.0028032064437866,
225
+ "eval_runtime": 15.6866,
226
+ "eval_samples_per_second": 592.862,
227
+ "eval_steps_per_second": 2.359,
228
+ "step": 1100
229
+ },
230
+ {
231
+ "epoch": 0.19,
232
+ "learning_rate": 4.520020103869995e-05,
233
+ "loss": 2.0906,
234
+ "step": 1150
235
+ },
236
+ {
237
+ "epoch": 0.2,
238
+ "learning_rate": 4.4990785726252305e-05,
239
+ "loss": 2.0648,
240
+ "step": 1200
241
+ },
242
+ {
243
+ "epoch": 0.2,
244
+ "eval_loss": 1.0790895223617554,
245
+ "eval_runtime": 15.8769,
246
+ "eval_samples_per_second": 585.758,
247
+ "eval_steps_per_second": 2.33,
248
+ "step": 1200
249
+ },
250
+ {
251
+ "epoch": 0.21,
252
+ "learning_rate": 4.478137041380466e-05,
253
+ "loss": 2.0613,
254
+ "step": 1250
255
+ },
256
+ {
257
+ "epoch": 0.22,
258
+ "learning_rate": 4.4571955101357016e-05,
259
+ "loss": 2.0785,
260
+ "step": 1300
261
+ },
262
+ {
263
+ "epoch": 0.22,
264
+ "eval_loss": 1.0282424688339233,
265
+ "eval_runtime": 15.9134,
266
+ "eval_samples_per_second": 584.414,
267
+ "eval_steps_per_second": 2.325,
268
+ "step": 1300
269
+ },
270
+ {
271
+ "epoch": 0.23,
272
+ "learning_rate": 4.436253978890937e-05,
273
+ "loss": 2.0786,
274
+ "step": 1350
275
+ },
276
+ {
277
+ "epoch": 0.23,
278
+ "learning_rate": 4.415312447646172e-05,
279
+ "loss": 2.0748,
280
+ "step": 1400
281
+ },
282
+ {
283
+ "epoch": 0.23,
284
+ "eval_loss": 1.0269650220870972,
285
+ "eval_runtime": 16.1388,
286
+ "eval_samples_per_second": 576.252,
287
+ "eval_steps_per_second": 2.293,
288
+ "step": 1400
289
+ },
290
+ {
291
+ "epoch": 0.24,
292
+ "learning_rate": 4.3943709164014075e-05,
293
+ "loss": 2.05,
294
+ "step": 1450
295
+ },
296
+ {
297
+ "epoch": 0.25,
298
+ "learning_rate": 4.373429385156643e-05,
299
+ "loss": 2.0797,
300
+ "step": 1500
301
+ },
302
+ {
303
+ "epoch": 0.25,
304
+ "eval_loss": 1.0267333984375,
305
+ "eval_runtime": 16.2393,
306
+ "eval_samples_per_second": 572.684,
307
+ "eval_steps_per_second": 2.278,
308
+ "step": 1500
309
+ },
310
+ {
311
+ "epoch": 0.26,
312
+ "learning_rate": 4.3524878539118785e-05,
313
+ "loss": 2.0736,
314
+ "step": 1550
315
+ },
316
+ {
317
+ "epoch": 0.27,
318
+ "learning_rate": 4.331546322667114e-05,
319
+ "loss": 2.0734,
320
+ "step": 1600
321
+ },
322
+ {
323
+ "epoch": 0.27,
324
+ "eval_loss": 1.0247018337249756,
325
+ "eval_runtime": 15.6292,
326
+ "eval_samples_per_second": 595.039,
327
+ "eval_steps_per_second": 2.367,
328
+ "step": 1600
329
+ },
330
+ {
331
+ "epoch": 0.28,
332
+ "learning_rate": 4.310604791422349e-05,
333
+ "loss": 2.0417,
334
+ "step": 1650
335
+ },
336
+ {
337
+ "epoch": 0.28,
338
+ "learning_rate": 4.2896632601775844e-05,
339
+ "loss": 2.0529,
340
+ "step": 1700
341
+ },
342
+ {
343
+ "epoch": 0.28,
344
+ "eval_loss": 1.0378021001815796,
345
+ "eval_runtime": 16.0607,
346
+ "eval_samples_per_second": 579.055,
347
+ "eval_steps_per_second": 2.304,
348
+ "step": 1700
349
+ },
350
+ {
351
+ "epoch": 0.29,
352
+ "learning_rate": 4.26872172893282e-05,
353
+ "loss": 2.0452,
354
+ "step": 1750
355
+ },
356
+ {
357
+ "epoch": 0.3,
358
+ "learning_rate": 4.247780197688055e-05,
359
+ "loss": 2.0267,
360
+ "step": 1800
361
+ },
362
+ {
363
+ "epoch": 0.3,
364
+ "eval_loss": 1.0441969633102417,
365
+ "eval_runtime": 16.1723,
366
+ "eval_samples_per_second": 575.058,
367
+ "eval_steps_per_second": 2.288,
368
+ "step": 1800
369
+ },
370
+ {
371
+ "epoch": 0.31,
372
+ "learning_rate": 4.226838666443291e-05,
373
+ "loss": 2.0142,
374
+ "step": 1850
375
+ },
376
+ {
377
+ "epoch": 0.32,
378
+ "learning_rate": 4.205897135198526e-05,
379
+ "loss": 2.0266,
380
+ "step": 1900
381
+ },
382
+ {
383
+ "epoch": 0.32,
384
+ "eval_loss": 1.0151029825210571,
385
+ "eval_runtime": 16.1156,
386
+ "eval_samples_per_second": 577.082,
387
+ "eval_steps_per_second": 2.296,
388
+ "step": 1900
389
+ },
390
+ {
391
+ "epoch": 0.33,
392
+ "learning_rate": 4.1849556039537614e-05,
393
+ "loss": 2.0295,
394
+ "step": 1950
395
+ },
396
+ {
397
+ "epoch": 0.34,
398
+ "learning_rate": 4.164014072708997e-05,
399
+ "loss": 2.0069,
400
+ "step": 2000
401
+ },
402
+ {
403
+ "epoch": 0.34,
404
+ "eval_loss": 1.026070237159729,
405
+ "eval_runtime": 16.0479,
406
+ "eval_samples_per_second": 579.515,
407
+ "eval_steps_per_second": 2.306,
408
+ "step": 2000
409
+ },
410
+ {
411
+ "epoch": 0.34,
412
+ "learning_rate": 4.143072541464232e-05,
413
+ "loss": 2.0349,
414
+ "step": 2050
415
+ },
416
+ {
417
+ "epoch": 0.35,
418
+ "learning_rate": 4.122131010219468e-05,
419
+ "loss": 2.0565,
420
+ "step": 2100
421
+ },
422
+ {
423
+ "epoch": 0.35,
424
+ "eval_loss": 1.0249441862106323,
425
+ "eval_runtime": 16.4225,
426
+ "eval_samples_per_second": 566.297,
427
+ "eval_steps_per_second": 2.253,
428
+ "step": 2100
429
+ },
430
+ {
431
+ "epoch": 0.36,
432
+ "learning_rate": 4.101189478974703e-05,
433
+ "loss": 2.0196,
434
+ "step": 2150
435
+ },
436
+ {
437
+ "epoch": 0.37,
438
+ "learning_rate": 4.080247947729938e-05,
439
+ "loss": 2.0251,
440
+ "step": 2200
441
+ },
442
+ {
443
+ "epoch": 0.37,
444
+ "eval_loss": 1.0202720165252686,
445
+ "eval_runtime": 16.1636,
446
+ "eval_samples_per_second": 575.367,
447
+ "eval_steps_per_second": 2.289,
448
+ "step": 2200
449
+ },
450
+ {
451
+ "epoch": 0.38,
452
+ "learning_rate": 4.059306416485174e-05,
453
+ "loss": 2.0122,
454
+ "step": 2250
455
+ },
456
+ {
457
+ "epoch": 0.39,
458
+ "learning_rate": 4.038364885240409e-05,
459
+ "loss": 2.0046,
460
+ "step": 2300
461
+ },
462
+ {
463
+ "epoch": 0.39,
464
+ "eval_loss": 1.0397460460662842,
465
+ "eval_runtime": 16.2098,
466
+ "eval_samples_per_second": 573.726,
467
+ "eval_steps_per_second": 2.283,
468
+ "step": 2300
469
+ },
470
+ {
471
+ "epoch": 0.39,
472
+ "learning_rate": 4.017423353995645e-05,
473
+ "loss": 1.9894,
474
+ "step": 2350
475
+ },
476
+ {
477
+ "epoch": 0.4,
478
+ "learning_rate": 3.99648182275088e-05,
479
+ "loss": 1.9869,
480
+ "step": 2400
481
+ },
482
+ {
483
+ "epoch": 0.4,
484
+ "eval_loss": 1.05420982837677,
485
+ "eval_runtime": 16.4921,
486
+ "eval_samples_per_second": 563.906,
487
+ "eval_steps_per_second": 2.243,
488
+ "step": 2400
489
+ },
490
+ {
491
+ "epoch": 0.41,
492
+ "learning_rate": 3.975540291506115e-05,
493
+ "loss": 2.022,
494
+ "step": 2450
495
+ },
496
+ {
497
+ "epoch": 0.42,
498
+ "learning_rate": 3.954598760261351e-05,
499
+ "loss": 1.9956,
500
+ "step": 2500
501
+ },
502
+ {
503
+ "epoch": 0.42,
504
+ "eval_loss": 1.031363844871521,
505
+ "eval_runtime": 16.1444,
506
+ "eval_samples_per_second": 576.052,
507
+ "eval_steps_per_second": 2.292,
508
+ "step": 2500
509
+ },
510
+ {
511
+ "epoch": 0.43,
512
+ "learning_rate": 3.9336572290165856e-05,
513
+ "loss": 1.9659,
514
+ "step": 2550
515
+ },
516
+ {
517
+ "epoch": 0.44,
518
+ "learning_rate": 3.912715697771821e-05,
519
+ "loss": 2.0078,
520
+ "step": 2600
521
+ },
522
+ {
523
+ "epoch": 0.44,
524
+ "eval_loss": 1.0604513883590698,
525
+ "eval_runtime": 16.0399,
526
+ "eval_samples_per_second": 579.804,
527
+ "eval_steps_per_second": 2.307,
528
+ "step": 2600
529
+ },
530
+ {
531
+ "epoch": 0.44,
532
+ "learning_rate": 3.891774166527057e-05,
533
+ "loss": 1.9656,
534
+ "step": 2650
535
+ },
536
+ {
537
+ "epoch": 0.45,
538
+ "learning_rate": 3.870832635282292e-05,
539
+ "loss": 1.9977,
540
+ "step": 2700
541
+ },
542
+ {
543
+ "epoch": 0.45,
544
+ "eval_loss": 1.0340880155563354,
545
+ "eval_runtime": 16.1614,
546
+ "eval_samples_per_second": 575.444,
547
+ "eval_steps_per_second": 2.289,
548
+ "step": 2700
549
+ },
550
+ {
551
+ "epoch": 0.46,
552
+ "learning_rate": 3.849891104037528e-05,
553
+ "loss": 2.0003,
554
+ "step": 2750
555
+ },
556
+ {
557
+ "epoch": 0.47,
558
+ "learning_rate": 3.8289495727927626e-05,
559
+ "loss": 1.9757,
560
+ "step": 2800
561
+ },
562
+ {
563
+ "epoch": 0.47,
564
+ "eval_loss": 1.0368785858154297,
565
+ "eval_runtime": 16.3593,
566
+ "eval_samples_per_second": 568.482,
567
+ "eval_steps_per_second": 2.262,
568
+ "step": 2800
569
+ },
570
+ {
571
+ "epoch": 0.48,
572
+ "learning_rate": 3.808008041547998e-05,
573
+ "loss": 1.9215,
574
+ "step": 2850
575
+ },
576
+ {
577
+ "epoch": 0.49,
578
+ "learning_rate": 3.7870665103032336e-05,
579
+ "loss": 1.975,
580
+ "step": 2900
581
+ },
582
+ {
583
+ "epoch": 0.49,
584
+ "eval_loss": 1.026584267616272,
585
+ "eval_runtime": 16.0597,
586
+ "eval_samples_per_second": 579.091,
587
+ "eval_steps_per_second": 2.304,
588
+ "step": 2900
589
+ },
590
+ {
591
+ "epoch": 0.49,
592
+ "learning_rate": 3.766124979058469e-05,
593
+ "loss": 1.9971,
594
+ "step": 2950
595
+ },
596
+ {
597
+ "epoch": 0.5,
598
+ "learning_rate": 3.745183447813705e-05,
599
+ "loss": 1.9633,
600
+ "step": 3000
601
+ },
602
+ {
603
+ "epoch": 0.5,
604
+ "eval_loss": 1.0441828966140747,
605
+ "eval_runtime": 15.8581,
606
+ "eval_samples_per_second": 586.449,
607
+ "eval_steps_per_second": 2.333,
608
+ "step": 3000
609
+ },
610
+ {
611
+ "epoch": 0.51,
612
+ "learning_rate": 3.7242419165689395e-05,
613
+ "loss": 1.9569,
614
+ "step": 3050
615
+ },
616
+ {
617
+ "epoch": 0.52,
618
+ "learning_rate": 3.703300385324175e-05,
619
+ "loss": 1.9916,
620
+ "step": 3100
621
+ },
622
+ {
623
+ "epoch": 0.52,
624
+ "eval_loss": 1.0376336574554443,
625
+ "eval_runtime": 15.252,
626
+ "eval_samples_per_second": 609.754,
627
+ "eval_steps_per_second": 2.426,
628
+ "step": 3100
629
+ },
630
+ {
631
+ "epoch": 0.53,
632
+ "learning_rate": 3.682777684704306e-05,
633
+ "loss": 1.9359,
634
+ "step": 3150
635
+ },
636
+ {
637
+ "epoch": 0.54,
638
+ "learning_rate": 3.6618361534595414e-05,
639
+ "loss": 1.9447,
640
+ "step": 3200
641
+ },
642
+ {
643
+ "epoch": 0.54,
644
+ "eval_loss": 0.9997221231460571,
645
+ "eval_runtime": 16.6296,
646
+ "eval_samples_per_second": 559.243,
647
+ "eval_steps_per_second": 2.225,
648
+ "step": 3200
649
+ },
650
+ {
651
+ "epoch": 0.54,
652
+ "learning_rate": 3.640894622214777e-05,
653
+ "loss": 1.9436,
654
+ "step": 3250
655
+ },
656
+ {
657
+ "epoch": 0.55,
658
+ "learning_rate": 3.619953090970012e-05,
659
+ "loss": 1.9924,
660
+ "step": 3300
661
+ },
662
+ {
663
+ "epoch": 0.55,
664
+ "eval_loss": 1.0513906478881836,
665
+ "eval_runtime": 16.0136,
666
+ "eval_samples_per_second": 580.755,
667
+ "eval_steps_per_second": 2.311,
668
+ "step": 3300
669
+ },
670
+ {
671
+ "epoch": 0.56,
672
+ "learning_rate": 3.599011559725247e-05,
673
+ "loss": 1.9793,
674
+ "step": 3350
675
+ },
676
+ {
677
+ "epoch": 0.57,
678
+ "learning_rate": 3.578070028480482e-05,
679
+ "loss": 1.9409,
680
+ "step": 3400
681
+ },
682
+ {
683
+ "epoch": 0.57,
684
+ "eval_loss": 1.056003212928772,
685
+ "eval_runtime": 16.6238,
686
+ "eval_samples_per_second": 559.44,
687
+ "eval_steps_per_second": 2.226,
688
+ "step": 3400
689
+ },
690
+ {
691
+ "epoch": 0.58,
692
+ "learning_rate": 3.557128497235718e-05,
693
+ "loss": 1.9694,
694
+ "step": 3450
695
+ },
696
+ {
697
+ "epoch": 0.59,
698
+ "learning_rate": 3.536186965990954e-05,
699
+ "loss": 1.9751,
700
+ "step": 3500
701
+ },
702
+ {
703
+ "epoch": 0.59,
704
+ "eval_loss": 1.035875678062439,
705
+ "eval_runtime": 15.8522,
706
+ "eval_samples_per_second": 586.668,
707
+ "eval_steps_per_second": 2.334,
708
+ "step": 3500
709
+ },
710
+ {
711
+ "epoch": 0.59,
712
+ "learning_rate": 3.515245434746189e-05,
713
+ "loss": 1.9504,
714
+ "step": 3550
715
+ },
716
+ {
717
+ "epoch": 0.6,
718
+ "learning_rate": 3.494303903501424e-05,
719
+ "loss": 1.9421,
720
+ "step": 3600
721
+ },
722
+ {
723
+ "epoch": 0.6,
724
+ "eval_loss": 1.0627715587615967,
725
+ "eval_runtime": 15.8545,
726
+ "eval_samples_per_second": 586.583,
727
+ "eval_steps_per_second": 2.334,
728
+ "step": 3600
729
+ },
730
+ {
731
+ "epoch": 0.61,
732
+ "learning_rate": 3.473362372256659e-05,
733
+ "loss": 1.9026,
734
+ "step": 3650
735
+ },
736
+ {
737
+ "epoch": 0.62,
738
+ "learning_rate": 3.452420841011895e-05,
739
+ "loss": 1.9501,
740
+ "step": 3700
741
+ },
742
+ {
743
+ "epoch": 0.62,
744
+ "eval_loss": 1.049263596534729,
745
+ "eval_runtime": 16.1043,
746
+ "eval_samples_per_second": 577.486,
747
+ "eval_steps_per_second": 2.298,
748
+ "step": 3700
749
+ },
750
+ {
751
+ "epoch": 0.63,
752
+ "learning_rate": 3.431479309767131e-05,
753
+ "loss": 1.9331,
754
+ "step": 3750
755
+ },
756
+ {
757
+ "epoch": 0.64,
758
+ "learning_rate": 3.4105377785223656e-05,
759
+ "loss": 1.9362,
760
+ "step": 3800
761
+ },
762
+ {
763
+ "epoch": 0.64,
764
+ "eval_loss": 1.039939045906067,
765
+ "eval_runtime": 16.1428,
766
+ "eval_samples_per_second": 576.107,
767
+ "eval_steps_per_second": 2.292,
768
+ "step": 3800
769
+ },
770
+ {
771
+ "epoch": 0.64,
772
+ "learning_rate": 3.389596247277601e-05,
773
+ "loss": 1.9174,
774
+ "step": 3850
775
+ },
776
+ {
777
+ "epoch": 0.65,
778
+ "learning_rate": 3.368654716032836e-05,
779
+ "loss": 1.94,
780
+ "step": 3900
781
+ },
782
+ {
783
+ "epoch": 0.65,
784
+ "eval_loss": 1.0347034931182861,
785
+ "eval_runtime": 16.2615,
786
+ "eval_samples_per_second": 571.902,
787
+ "eval_steps_per_second": 2.275,
788
+ "step": 3900
789
+ },
790
+ {
791
+ "epoch": 0.66,
792
+ "learning_rate": 3.3477131847880715e-05,
793
+ "loss": 1.9182,
794
+ "step": 3950
795
+ },
796
+ {
797
+ "epoch": 0.67,
798
+ "learning_rate": 3.326771653543308e-05,
799
+ "loss": 1.9499,
800
+ "step": 4000
801
+ },
802
+ {
803
+ "epoch": 0.67,
804
+ "eval_loss": 1.0576939582824707,
805
+ "eval_runtime": 16.282,
806
+ "eval_samples_per_second": 571.183,
807
+ "eval_steps_per_second": 2.272,
808
+ "step": 4000
809
+ },
810
+ {
811
+ "epoch": 0.68,
812
+ "learning_rate": 3.3058301222985426e-05,
813
+ "loss": 1.9325,
814
+ "step": 4050
815
+ },
816
+ {
817
+ "epoch": 0.69,
818
+ "learning_rate": 3.284888591053778e-05,
819
+ "loss": 1.9062,
820
+ "step": 4100
821
+ },
822
+ {
823
+ "epoch": 0.69,
824
+ "eval_loss": 1.0423039197921753,
825
+ "eval_runtime": 15.965,
826
+ "eval_samples_per_second": 582.524,
827
+ "eval_steps_per_second": 2.318,
828
+ "step": 4100
829
+ },
830
+ {
831
+ "epoch": 0.7,
832
+ "learning_rate": 3.263947059809013e-05,
833
+ "loss": 1.9215,
834
+ "step": 4150
835
+ },
836
+ {
837
+ "epoch": 0.7,
838
+ "learning_rate": 3.2430055285642485e-05,
839
+ "loss": 1.9026,
840
+ "step": 4200
841
+ },
842
+ {
843
+ "epoch": 0.7,
844
+ "eval_loss": 1.0391933917999268,
845
+ "eval_runtime": 14.8765,
846
+ "eval_samples_per_second": 625.146,
847
+ "eval_steps_per_second": 2.487,
848
+ "step": 4200
849
+ },
850
+ {
851
+ "epoch": 0.71,
852
+ "learning_rate": 3.222063997319485e-05,
853
+ "loss": 1.9491,
854
+ "step": 4250
855
+ },
856
+ {
857
+ "epoch": 0.72,
858
+ "learning_rate": 3.2011224660747195e-05,
859
+ "loss": 1.9152,
860
+ "step": 4300
861
+ },
862
+ {
863
+ "epoch": 0.72,
864
+ "eval_loss": 1.0235393047332764,
865
+ "eval_runtime": 15.2908,
866
+ "eval_samples_per_second": 608.21,
867
+ "eval_steps_per_second": 2.42,
868
+ "step": 4300
869
+ },
870
+ {
871
+ "epoch": 0.73,
872
+ "learning_rate": 3.180180934829955e-05,
873
+ "loss": 1.9233,
874
+ "step": 4350
875
+ },
876
+ {
877
+ "epoch": 0.74,
878
+ "learning_rate": 3.15923940358519e-05,
879
+ "loss": 1.9279,
880
+ "step": 4400
881
+ },
882
+ {
883
+ "epoch": 0.74,
884
+ "eval_loss": 1.028559923171997,
885
+ "eval_runtime": 14.9232,
886
+ "eval_samples_per_second": 623.192,
887
+ "eval_steps_per_second": 2.479,
888
+ "step": 4400
889
+ },
890
+ {
891
+ "epoch": 0.75,
892
+ "learning_rate": 3.1382978723404254e-05,
893
+ "loss": 1.9025,
894
+ "step": 4450
895
+ },
896
+ {
897
+ "epoch": 0.75,
898
+ "learning_rate": 3.1173563410956616e-05,
899
+ "loss": 1.9195,
900
+ "step": 4500
901
+ },
902
+ {
903
+ "epoch": 0.75,
904
+ "eval_loss": 1.025765061378479,
905
+ "eval_runtime": 14.9794,
906
+ "eval_samples_per_second": 620.854,
907
+ "eval_steps_per_second": 2.47,
908
+ "step": 4500
909
+ },
910
+ {
911
+ "epoch": 0.76,
912
+ "learning_rate": 3.0964148098508965e-05,
913
+ "loss": 1.9,
914
+ "step": 4550
915
+ },
916
+ {
917
+ "epoch": 0.77,
918
+ "learning_rate": 3.075473278606132e-05,
919
+ "loss": 1.9079,
920
+ "step": 4600
921
+ },
922
+ {
923
+ "epoch": 0.77,
924
+ "eval_loss": 1.0234707593917847,
925
+ "eval_runtime": 15.1735,
926
+ "eval_samples_per_second": 612.912,
927
+ "eval_steps_per_second": 2.438,
928
+ "step": 4600
929
+ },
930
+ {
931
+ "epoch": 0.78,
932
+ "learning_rate": 3.054531747361367e-05,
933
+ "loss": 1.8868,
934
+ "step": 4650
935
+ },
936
+ {
937
+ "epoch": 0.79,
938
+ "learning_rate": 3.0335902161166024e-05,
939
+ "loss": 1.9096,
940
+ "step": 4700
941
+ },
942
+ {
943
+ "epoch": 0.79,
944
+ "eval_loss": 1.0209083557128906,
945
+ "eval_runtime": 15.0076,
946
+ "eval_samples_per_second": 619.685,
947
+ "eval_steps_per_second": 2.465,
948
+ "step": 4700
949
+ },
950
+ {
951
+ "epoch": 0.8,
952
+ "learning_rate": 3.0126486848718375e-05,
953
+ "loss": 1.8942,
954
+ "step": 4750
955
+ },
956
+ {
957
+ "epoch": 0.8,
958
+ "learning_rate": 2.9917071536270734e-05,
959
+ "loss": 1.9036,
960
+ "step": 4800
961
+ },
962
+ {
963
+ "epoch": 0.8,
964
+ "eval_loss": 1.0443929433822632,
965
+ "eval_runtime": 14.8431,
966
+ "eval_samples_per_second": 626.553,
967
+ "eval_steps_per_second": 2.493,
968
+ "step": 4800
969
+ },
970
+ {
971
+ "epoch": 0.81,
972
+ "learning_rate": 2.970765622382309e-05,
973
+ "loss": 1.892,
974
+ "step": 4850
975
+ },
976
+ {
977
+ "epoch": 0.82,
978
+ "learning_rate": 2.949824091137544e-05,
979
+ "loss": 1.926,
980
+ "step": 4900
981
+ },
982
+ {
983
+ "epoch": 0.82,
984
+ "eval_loss": 1.041871190071106,
985
+ "eval_runtime": 15.1234,
986
+ "eval_samples_per_second": 614.939,
987
+ "eval_steps_per_second": 2.447,
988
+ "step": 4900
989
+ },
990
+ {
991
+ "epoch": 0.83,
992
+ "learning_rate": 2.9288825598927793e-05,
993
+ "loss": 1.8894,
994
+ "step": 4950
995
+ },
996
+ {
997
+ "epoch": 0.84,
998
+ "learning_rate": 2.9079410286480145e-05,
999
+ "loss": 1.9163,
1000
+ "step": 5000
1001
+ },
1002
+ {
1003
+ "epoch": 0.84,
1004
+ "eval_loss": 1.0646345615386963,
1005
+ "eval_runtime": 14.6224,
1006
+ "eval_samples_per_second": 636.01,
1007
+ "eval_steps_per_second": 2.53,
1008
+ "step": 5000
1009
+ },
1010
+ {
1011
+ "epoch": 0.85,
1012
+ "learning_rate": 2.8869994974032504e-05,
1013
+ "loss": 1.9106,
1014
+ "step": 5050
1015
+ },
1016
+ {
1017
+ "epoch": 0.85,
1018
+ "learning_rate": 2.866057966158486e-05,
1019
+ "loss": 1.8973,
1020
+ "step": 5100
1021
+ },
1022
+ {
1023
+ "epoch": 0.85,
1024
+ "eval_loss": 1.0399593114852905,
1025
+ "eval_runtime": 14.8746,
1026
+ "eval_samples_per_second": 625.228,
1027
+ "eval_steps_per_second": 2.487,
1028
+ "step": 5100
1029
+ },
1030
+ {
1031
+ "epoch": 0.86,
1032
+ "learning_rate": 2.845116434913721e-05,
1033
+ "loss": 1.9107,
1034
+ "step": 5150
1035
+ },
1036
+ {
1037
+ "epoch": 0.87,
1038
+ "learning_rate": 2.8241749036689562e-05,
1039
+ "loss": 1.9189,
1040
+ "step": 5200
1041
+ },
1042
+ {
1043
+ "epoch": 0.87,
1044
+ "eval_loss": 1.0655252933502197,
1045
+ "eval_runtime": 15.0739,
1046
+ "eval_samples_per_second": 616.962,
1047
+ "eval_steps_per_second": 2.455,
1048
+ "step": 5200
1049
+ },
1050
+ {
1051
+ "epoch": 0.88,
1052
+ "learning_rate": 2.8032333724241914e-05,
1053
+ "loss": 1.9153,
1054
+ "step": 5250
1055
+ },
1056
+ {
1057
+ "epoch": 0.89,
1058
+ "learning_rate": 2.7822918411794273e-05,
1059
+ "loss": 1.9365,
1060
+ "step": 5300
1061
+ },
1062
+ {
1063
+ "epoch": 0.89,
1064
+ "eval_loss": 1.0276598930358887,
1065
+ "eval_runtime": 15.0216,
1066
+ "eval_samples_per_second": 619.109,
1067
+ "eval_steps_per_second": 2.463,
1068
+ "step": 5300
1069
+ },
1070
+ {
1071
+ "epoch": 0.9,
1072
+ "learning_rate": 2.7613503099346628e-05,
1073
+ "loss": 1.8813,
1074
+ "step": 5350
1075
+ },
1076
+ {
1077
+ "epoch": 0.9,
1078
+ "learning_rate": 2.740408778689898e-05,
1079
+ "loss": 1.8907,
1080
+ "step": 5400
1081
+ },
1082
+ {
1083
+ "epoch": 0.9,
1084
+ "eval_loss": 1.0385425090789795,
1085
+ "eval_runtime": 15.1648,
1086
+ "eval_samples_per_second": 613.264,
1087
+ "eval_steps_per_second": 2.44,
1088
+ "step": 5400
1089
+ },
1090
+ {
1091
+ "epoch": 0.91,
1092
+ "learning_rate": 2.7194672474451332e-05,
1093
+ "loss": 1.9187,
1094
+ "step": 5450
1095
+ },
1096
+ {
1097
+ "epoch": 0.92,
1098
+ "learning_rate": 2.6985257162003684e-05,
1099
+ "loss": 1.8917,
1100
+ "step": 5500
1101
+ },
1102
+ {
1103
+ "epoch": 0.92,
1104
+ "eval_loss": 1.0304224491119385,
1105
+ "eval_runtime": 14.8782,
1106
+ "eval_samples_per_second": 625.077,
1107
+ "eval_steps_per_second": 2.487,
1108
+ "step": 5500
1109
+ },
1110
+ {
1111
+ "epoch": 0.93,
1112
+ "learning_rate": 2.6780030155804992e-05,
1113
+ "loss": 1.8612,
1114
+ "step": 5550
1115
+ },
1116
+ {
1117
+ "epoch": 0.94,
1118
+ "learning_rate": 2.657061484335735e-05,
1119
+ "loss": 1.9204,
1120
+ "step": 5600
1121
+ },
1122
+ {
1123
+ "epoch": 0.94,
1124
+ "eval_loss": 1.0601547956466675,
1125
+ "eval_runtime": 14.8806,
1126
+ "eval_samples_per_second": 624.975,
1127
+ "eval_steps_per_second": 2.486,
1128
+ "step": 5600
1129
+ },
1130
+ {
1131
+ "epoch": 0.95,
1132
+ "learning_rate": 2.6361199530909702e-05,
1133
+ "loss": 1.8942,
1134
+ "step": 5650
1135
+ },
1136
+ {
1137
+ "epoch": 0.95,
1138
+ "learning_rate": 2.6151784218462054e-05,
1139
+ "loss": 1.9072,
1140
+ "step": 5700
1141
+ },
1142
+ {
1143
+ "epoch": 0.95,
1144
+ "eval_loss": 1.0364910364151,
1145
+ "eval_runtime": 14.8508,
1146
+ "eval_samples_per_second": 626.23,
1147
+ "eval_steps_per_second": 2.491,
1148
+ "step": 5700
1149
+ },
1150
+ {
1151
+ "epoch": 0.96,
1152
+ "learning_rate": 2.594236890601441e-05,
1153
+ "loss": 1.904,
1154
+ "step": 5750
1155
+ },
1156
+ {
1157
+ "epoch": 0.97,
1158
+ "learning_rate": 2.573295359356676e-05,
1159
+ "loss": 1.8952,
1160
+ "step": 5800
1161
+ },
1162
+ {
1163
+ "epoch": 0.97,
1164
+ "eval_loss": 1.0579551458358765,
1165
+ "eval_runtime": 15.8601,
1166
+ "eval_samples_per_second": 586.376,
1167
+ "eval_steps_per_second": 2.333,
1168
+ "step": 5800
1169
+ },
1170
+ {
1171
+ "epoch": 0.98,
1172
+ "learning_rate": 2.552353828111912e-05,
1173
+ "loss": 1.8922,
1174
+ "step": 5850
1175
+ },
1176
+ {
1177
+ "epoch": 0.99,
1178
+ "learning_rate": 2.5314122968671472e-05,
1179
+ "loss": 1.9074,
1180
+ "step": 5900
1181
+ },
1182
+ {
1183
+ "epoch": 0.99,
1184
+ "eval_loss": 1.0638257265090942,
1185
+ "eval_runtime": 14.9183,
1186
+ "eval_samples_per_second": 623.397,
1187
+ "eval_steps_per_second": 2.48,
1188
+ "step": 5900
1189
+ },
1190
+ {
1191
+ "epoch": 1.0,
1192
+ "learning_rate": 2.5104707656223824e-05,
1193
+ "loss": 1.8948,
1194
+ "step": 5950
1195
+ },
1196
+ {
1197
+ "epoch": 1.01,
1198
+ "learning_rate": 2.489529234377618e-05,
1199
+ "loss": 1.8758,
1200
+ "step": 6000
1201
+ },
1202
+ {
1203
+ "epoch": 1.01,
1204
+ "eval_loss": 1.0558853149414062,
1205
+ "eval_runtime": 14.5514,
1206
+ "eval_samples_per_second": 639.112,
1207
+ "eval_steps_per_second": 2.543,
1208
+ "step": 6000
1209
+ },
1210
+ {
1211
+ "epoch": 1.01,
1212
+ "learning_rate": 2.4685877031328534e-05,
1213
+ "loss": 1.8458,
1214
+ "step": 6050
1215
+ },
1216
+ {
1217
+ "epoch": 1.02,
1218
+ "learning_rate": 2.4476461718880886e-05,
1219
+ "loss": 1.8118,
1220
+ "step": 6100
1221
+ },
1222
+ {
1223
+ "epoch": 1.02,
1224
+ "eval_loss": 1.0648460388183594,
1225
+ "eval_runtime": 14.9731,
1226
+ "eval_samples_per_second": 621.114,
1227
+ "eval_steps_per_second": 2.471,
1228
+ "step": 6100
1229
+ },
1230
+ {
1231
+ "epoch": 1.03,
1232
+ "learning_rate": 2.4267046406433238e-05,
1233
+ "loss": 1.8216,
1234
+ "step": 6150
1235
+ },
1236
+ {
1237
+ "epoch": 1.04,
1238
+ "learning_rate": 2.4057631093985593e-05,
1239
+ "loss": 1.8181,
1240
+ "step": 6200
1241
+ },
1242
+ {
1243
+ "epoch": 1.04,
1244
+ "eval_loss": 1.0505411624908447,
1245
+ "eval_runtime": 14.9571,
1246
+ "eval_samples_per_second": 621.78,
1247
+ "eval_steps_per_second": 2.474,
1248
+ "step": 6200
1249
+ },
1250
+ {
1251
+ "epoch": 1.05,
1252
+ "learning_rate": 2.3848215781537948e-05,
1253
+ "loss": 1.8426,
1254
+ "step": 6250
1255
+ },
1256
+ {
1257
+ "epoch": 1.06,
1258
+ "learning_rate": 2.36388004690903e-05,
1259
+ "loss": 1.8481,
1260
+ "step": 6300
1261
+ },
1262
+ {
1263
+ "epoch": 1.06,
1264
+ "eval_loss": 1.0492850542068481,
1265
+ "eval_runtime": 14.8837,
1266
+ "eval_samples_per_second": 624.843,
1267
+ "eval_steps_per_second": 2.486,
1268
+ "step": 6300
1269
+ },
1270
+ {
1271
+ "epoch": 1.06,
1272
+ "learning_rate": 2.3429385156642655e-05,
1273
+ "loss": 1.8044,
1274
+ "step": 6350
1275
+ },
1276
+ {
1277
+ "epoch": 1.07,
1278
+ "learning_rate": 2.3219969844195007e-05,
1279
+ "loss": 1.8236,
1280
+ "step": 6400
1281
+ },
1282
+ {
1283
+ "epoch": 1.07,
1284
+ "eval_loss": 1.0615003108978271,
1285
+ "eval_runtime": 14.7795,
1286
+ "eval_samples_per_second": 629.251,
1287
+ "eval_steps_per_second": 2.503,
1288
+ "step": 6400
1289
+ },
1290
+ {
1291
+ "epoch": 1.08,
1292
+ "learning_rate": 2.3010554531747362e-05,
1293
+ "loss": 1.817,
1294
+ "step": 6450
1295
+ },
1296
+ {
1297
+ "epoch": 1.09,
1298
+ "learning_rate": 2.2801139219299718e-05,
1299
+ "loss": 1.8547,
1300
+ "step": 6500
1301
+ },
1302
+ {
1303
+ "epoch": 1.09,
1304
+ "eval_loss": 1.070224404335022,
1305
+ "eval_runtime": 15.1165,
1306
+ "eval_samples_per_second": 615.223,
1307
+ "eval_steps_per_second": 2.448,
1308
+ "step": 6500
1309
+ },
1310
+ {
1311
+ "epoch": 1.1,
1312
+ "learning_rate": 2.259172390685207e-05,
1313
+ "loss": 1.817,
1314
+ "step": 6550
1315
+ },
1316
+ {
1317
+ "epoch": 1.11,
1318
+ "learning_rate": 2.2382308594404425e-05,
1319
+ "loss": 1.8073,
1320
+ "step": 6600
1321
+ },
1322
+ {
1323
+ "epoch": 1.11,
1324
+ "eval_loss": 1.0603699684143066,
1325
+ "eval_runtime": 15.0786,
1326
+ "eval_samples_per_second": 616.768,
1327
+ "eval_steps_per_second": 2.454,
1328
+ "step": 6600
1329
+ },
1330
+ {
1331
+ "epoch": 1.11,
1332
+ "learning_rate": 2.2172893281956777e-05,
1333
+ "loss": 1.8214,
1334
+ "step": 6650
1335
+ },
1336
+ {
1337
+ "epoch": 1.12,
1338
+ "learning_rate": 2.1963477969509132e-05,
1339
+ "loss": 1.8323,
1340
+ "step": 6700
1341
+ },
1342
+ {
1343
+ "epoch": 1.12,
1344
+ "eval_loss": 1.0744497776031494,
1345
+ "eval_runtime": 15.0113,
1346
+ "eval_samples_per_second": 619.535,
1347
+ "eval_steps_per_second": 2.465,
1348
+ "step": 6700
1349
+ },
1350
+ {
1351
+ "epoch": 1.13,
1352
+ "learning_rate": 2.1754062657061487e-05,
1353
+ "loss": 1.8072,
1354
+ "step": 6750
1355
+ },
1356
+ {
1357
+ "epoch": 1.14,
1358
+ "learning_rate": 2.154464734461384e-05,
1359
+ "loss": 1.8291,
1360
+ "step": 6800
1361
+ },
1362
+ {
1363
+ "epoch": 1.14,
1364
+ "eval_loss": 1.0766249895095825,
1365
+ "eval_runtime": 15.0159,
1366
+ "eval_samples_per_second": 619.343,
1367
+ "eval_steps_per_second": 2.464,
1368
+ "step": 6800
1369
+ },
1370
+ {
1371
+ "epoch": 1.15,
1372
+ "learning_rate": 2.1335232032166194e-05,
1373
+ "loss": 1.856,
1374
+ "step": 6850
1375
+ },
1376
+ {
1377
+ "epoch": 1.16,
1378
+ "learning_rate": 2.1125816719718546e-05,
1379
+ "loss": 1.8066,
1380
+ "step": 6900
1381
+ },
1382
+ {
1383
+ "epoch": 1.16,
1384
+ "eval_loss": 1.0853835344314575,
1385
+ "eval_runtime": 14.9684,
1386
+ "eval_samples_per_second": 621.309,
1387
+ "eval_steps_per_second": 2.472,
1388
+ "step": 6900
1389
+ },
1390
+ {
1391
+ "epoch": 1.16,
1392
+ "learning_rate": 2.09164014072709e-05,
1393
+ "loss": 1.8015,
1394
+ "step": 6950
1395
+ },
1396
+ {
1397
+ "epoch": 1.17,
1398
+ "learning_rate": 2.0706986094823257e-05,
1399
+ "loss": 1.8383,
1400
+ "step": 7000
1401
+ },
1402
+ {
1403
+ "epoch": 1.17,
1404
+ "eval_loss": 1.0606310367584229,
1405
+ "eval_runtime": 14.9212,
1406
+ "eval_samples_per_second": 623.274,
1407
+ "eval_steps_per_second": 2.48,
1408
+ "step": 7000
1409
+ },
1410
+ {
1411
+ "epoch": 1.18,
1412
+ "learning_rate": 2.049757078237561e-05,
1413
+ "loss": 1.8252,
1414
+ "step": 7050
1415
+ },
1416
+ {
1417
+ "epoch": 1.19,
1418
+ "learning_rate": 2.028815546992796e-05,
1419
+ "loss": 1.8075,
1420
+ "step": 7100
1421
+ },
1422
+ {
1423
+ "epoch": 1.19,
1424
+ "eval_loss": 1.0541033744812012,
1425
+ "eval_runtime": 15.0551,
1426
+ "eval_samples_per_second": 617.733,
1427
+ "eval_steps_per_second": 2.458,
1428
+ "step": 7100
1429
+ },
1430
+ {
1431
+ "epoch": 1.2,
1432
+ "learning_rate": 2.0078740157480316e-05,
1433
+ "loss": 1.8103,
1434
+ "step": 7150
1435
+ },
1436
+ {
1437
+ "epoch": 1.21,
1438
+ "learning_rate": 1.986932484503267e-05,
1439
+ "loss": 1.8075,
1440
+ "step": 7200
1441
+ },
1442
+ {
1443
+ "epoch": 1.21,
1444
+ "eval_loss": 1.0540446043014526,
1445
+ "eval_runtime": 14.9234,
1446
+ "eval_samples_per_second": 623.184,
1447
+ "eval_steps_per_second": 2.479,
1448
+ "step": 7200
1449
+ },
1450
+ {
1451
+ "epoch": 1.21,
1452
+ "learning_rate": 1.9659909532585026e-05,
1453
+ "loss": 1.8457,
1454
+ "step": 7250
1455
+ },
1456
+ {
1457
+ "epoch": 1.22,
1458
+ "learning_rate": 1.9450494220137378e-05,
1459
+ "loss": 1.8445,
1460
+ "step": 7300
1461
+ },
1462
+ {
1463
+ "epoch": 1.22,
1464
+ "eval_loss": 1.0582592487335205,
1465
+ "eval_runtime": 15.172,
1466
+ "eval_samples_per_second": 612.97,
1467
+ "eval_steps_per_second": 2.439,
1468
+ "step": 7300
1469
+ },
1470
+ {
1471
+ "epoch": 1.23,
1472
+ "learning_rate": 1.924107890768973e-05,
1473
+ "loss": 1.831,
1474
+ "step": 7350
1475
+ },
1476
+ {
1477
+ "epoch": 1.24,
1478
+ "learning_rate": 1.9031663595242085e-05,
1479
+ "loss": 1.8413,
1480
+ "step": 7400
1481
+ },
1482
+ {
1483
+ "epoch": 1.24,
1484
+ "eval_loss": 1.0817487239837646,
1485
+ "eval_runtime": 14.9359,
1486
+ "eval_samples_per_second": 622.66,
1487
+ "eval_steps_per_second": 2.477,
1488
+ "step": 7400
1489
+ },
1490
+ {
1491
+ "epoch": 1.25,
1492
+ "learning_rate": 1.8822248282794437e-05,
1493
+ "loss": 1.8035,
1494
+ "step": 7450
1495
+ },
1496
+ {
1497
+ "epoch": 1.26,
1498
+ "learning_rate": 1.8612832970346792e-05,
1499
+ "loss": 1.8143,
1500
+ "step": 7500
1501
+ },
1502
+ {
1503
+ "epoch": 1.26,
1504
+ "eval_loss": 1.0620262622833252,
1505
+ "eval_runtime": 14.9509,
1506
+ "eval_samples_per_second": 622.036,
1507
+ "eval_steps_per_second": 2.475,
1508
+ "step": 7500
1509
+ },
1510
+ {
1511
+ "epoch": 1.26,
1512
+ "learning_rate": 1.8403417657899147e-05,
1513
+ "loss": 1.8111,
1514
+ "step": 7550
1515
+ },
1516
+ {
1517
+ "epoch": 1.27,
1518
+ "learning_rate": 1.81940023454515e-05,
1519
+ "loss": 1.8291,
1520
+ "step": 7600
1521
+ },
1522
+ {
1523
+ "epoch": 1.27,
1524
+ "eval_loss": 1.0485789775848389,
1525
+ "eval_runtime": 15.2218,
1526
+ "eval_samples_per_second": 610.966,
1527
+ "eval_steps_per_second": 2.431,
1528
+ "step": 7600
1529
+ },
1530
+ {
1531
+ "epoch": 1.28,
1532
+ "learning_rate": 1.7984587033003854e-05,
1533
+ "loss": 1.8267,
1534
+ "step": 7650
1535
+ },
1536
+ {
1537
+ "epoch": 1.29,
1538
+ "learning_rate": 1.7775171720556206e-05,
1539
+ "loss": 1.8234,
1540
+ "step": 7700
1541
+ },
1542
+ {
1543
+ "epoch": 1.29,
1544
+ "eval_loss": 1.0669416189193726,
1545
+ "eval_runtime": 14.7589,
1546
+ "eval_samples_per_second": 630.128,
1547
+ "eval_steps_per_second": 2.507,
1548
+ "step": 7700
1549
+ },
1550
+ {
1551
+ "epoch": 1.3,
1552
+ "learning_rate": 1.756575640810856e-05,
1553
+ "loss": 1.8435,
1554
+ "step": 7750
1555
+ },
1556
+ {
1557
+ "epoch": 1.31,
1558
+ "learning_rate": 1.7356341095660917e-05,
1559
+ "loss": 1.7901,
1560
+ "step": 7800
1561
+ },
1562
+ {
1563
+ "epoch": 1.31,
1564
+ "eval_loss": 1.0574383735656738,
1565
+ "eval_runtime": 15.1846,
1566
+ "eval_samples_per_second": 612.462,
1567
+ "eval_steps_per_second": 2.437,
1568
+ "step": 7800
1569
+ },
1570
+ {
1571
+ "epoch": 1.32,
1572
+ "learning_rate": 1.714692578321327e-05,
1573
+ "loss": 1.8135,
1574
+ "step": 7850
1575
+ },
1576
+ {
1577
+ "epoch": 1.32,
1578
+ "learning_rate": 1.693751047076562e-05,
1579
+ "loss": 1.8241,
1580
+ "step": 7900
1581
+ },
1582
+ {
1583
+ "epoch": 1.32,
1584
+ "eval_loss": 1.0570663213729858,
1585
+ "eval_runtime": 14.8346,
1586
+ "eval_samples_per_second": 626.913,
1587
+ "eval_steps_per_second": 2.494,
1588
+ "step": 7900
1589
+ },
1590
+ {
1591
+ "epoch": 1.33,
1592
+ "learning_rate": 1.6728095158317976e-05,
1593
+ "loss": 1.8,
1594
+ "step": 7950
1595
+ },
1596
+ {
1597
+ "epoch": 1.34,
1598
+ "learning_rate": 1.651867984587033e-05,
1599
+ "loss": 1.8202,
1600
+ "step": 8000
1601
+ },
1602
+ {
1603
+ "epoch": 1.34,
1604
+ "eval_loss": 1.052140235900879,
1605
+ "eval_runtime": 15.1776,
1606
+ "eval_samples_per_second": 612.744,
1607
+ "eval_steps_per_second": 2.438,
1608
+ "step": 8000
1609
+ }
1610
+ ],
1611
+ "max_steps": 11938,
1612
+ "num_train_epochs": 2,
1613
+ "total_flos": 1.4030127364571136e+18,
1614
+ "trial_name": null,
1615
+ "trial_params": null
1616
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67d6a9068328893ac795bd5007ade8cb3bdfdf14743bd5c5f907f34834d995b1
3
+ size 2991
vocab.json ADDED
The diff for this file is too large to render. See raw diff