tscholak commited on
Commit
c9beb6e
1 Parent(s): 57782b3

add T5 model

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
1
+ {" <": 32101, " <=": 32100}
config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "t5-3b",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 16384,
7
+ "d_kv": 128,
8
+ "d_model": 1024,
9
+ "decoder_start_token_id": 0,
10
+ "diversity_penalty": null,
11
+ "dropout_rate": 0.1,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "relu",
14
+ "gradient_checkpointing": false,
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "max_length": 512,
19
+ "model_type": "t5",
20
+ "n_positions": 512,
21
+ "num_decoder_layers": 24,
22
+ "num_heads": 32,
23
+ "num_layers": 24,
24
+ "output_past": true,
25
+ "pad_token_id": 0,
26
+ "relative_attention_num_buckets": 32,
27
+ "task_specific_params": {
28
+ "summarization": {
29
+ "early_stopping": true,
30
+ "length_penalty": 2.0,
31
+ "max_length": 200,
32
+ "min_length": 30,
33
+ "no_repeat_ngram_size": 3,
34
+ "num_beams": 4,
35
+ "prefix": "summarize: "
36
+ },
37
+ "translation_en_to_de": {
38
+ "early_stopping": true,
39
+ "max_length": 300,
40
+ "num_beams": 4,
41
+ "prefix": "translate English to German: "
42
+ },
43
+ "translation_en_to_fr": {
44
+ "early_stopping": true,
45
+ "max_length": 300,
46
+ "num_beams": 4,
47
+ "prefix": "translate English to French: "
48
+ },
49
+ "translation_en_to_ro": {
50
+ "early_stopping": true,
51
+ "max_length": 300,
52
+ "num_beams": 4,
53
+ "prefix": "translate English to Romanian: "
54
+ }
55
+ },
56
+ "transformers_version": "4.6.0.dev0",
57
+ "use_cache": true,
58
+ "vocab_size": 32102
59
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1664ca859e52d1cd96dc25f42599f6bf3596b8d5c75f2d1ba9db597381ce3f64
3
+ size 13641236
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b4a0f6023f0549a0bd428abd808116ca9b02cccff94e8244e1bd7a121aaa559
3
+ size 11406535108
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6c647f865dbdd3c524e97e5f8e73ff48c5c50afdd9820fc163c01eb594b00ed
3
+ size 623
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"]}
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
3
+ size 791656
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 100, "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"], "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "t5-3b"}
trainer_state.json ADDED
@@ -0,0 +1,1790 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 362.58514285714284,
5
+ "global_step": 1088,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.29,
12
+ "learning_rate": 0.0001,
13
+ "loss": 3.7248,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 1.29,
18
+ "learning_rate": 0.0001,
19
+ "loss": 2.1406,
20
+ "step": 4
21
+ },
22
+ {
23
+ "epoch": 2.59,
24
+ "learning_rate": 0.0001,
25
+ "loss": 1.0705,
26
+ "step": 8
27
+ },
28
+ {
29
+ "epoch": 3.88,
30
+ "learning_rate": 0.0001,
31
+ "loss": 0.6039,
32
+ "step": 12
33
+ },
34
+ {
35
+ "epoch": 5.29,
36
+ "learning_rate": 0.0001,
37
+ "loss": 0.4378,
38
+ "step": 16
39
+ },
40
+ {
41
+ "epoch": 6.59,
42
+ "learning_rate": 0.0001,
43
+ "loss": 0.3218,
44
+ "step": 20
45
+ },
46
+ {
47
+ "epoch": 7.88,
48
+ "learning_rate": 0.0001,
49
+ "loss": 0.2751,
50
+ "step": 24
51
+ },
52
+ {
53
+ "epoch": 9.29,
54
+ "learning_rate": 0.0001,
55
+ "loss": 0.2423,
56
+ "step": 28
57
+ },
58
+ {
59
+ "epoch": 10.59,
60
+ "learning_rate": 0.0001,
61
+ "loss": 0.1763,
62
+ "step": 32
63
+ },
64
+ {
65
+ "epoch": 11.88,
66
+ "learning_rate": 0.0001,
67
+ "loss": 0.1574,
68
+ "step": 36
69
+ },
70
+ {
71
+ "epoch": 13.29,
72
+ "learning_rate": 0.0001,
73
+ "loss": 0.137,
74
+ "step": 40
75
+ },
76
+ {
77
+ "epoch": 14.59,
78
+ "learning_rate": 0.0001,
79
+ "loss": 0.1274,
80
+ "step": 44
81
+ },
82
+ {
83
+ "epoch": 15.88,
84
+ "learning_rate": 0.0001,
85
+ "loss": 0.141,
86
+ "step": 48
87
+ },
88
+ {
89
+ "epoch": 17.29,
90
+ "learning_rate": 0.0001,
91
+ "loss": 0.1345,
92
+ "step": 52
93
+ },
94
+ {
95
+ "epoch": 18.59,
96
+ "learning_rate": 0.0001,
97
+ "loss": 0.0848,
98
+ "step": 56
99
+ },
100
+ {
101
+ "epoch": 19.88,
102
+ "learning_rate": 0.0001,
103
+ "loss": 0.0719,
104
+ "step": 60
105
+ },
106
+ {
107
+ "epoch": 21.29,
108
+ "learning_rate": 0.0001,
109
+ "loss": 0.0814,
110
+ "step": 64
111
+ },
112
+ {
113
+ "epoch": 21.29,
114
+ "eval_exact_match": 0.5145067698259188,
115
+ "eval_loss": 0.15547168254852295,
116
+ "eval_runtime": 2276.8146,
117
+ "eval_samples_per_second": 0.454,
118
+ "step": 64
119
+ },
120
+ {
121
+ "epoch": 22.59,
122
+ "learning_rate": 0.0001,
123
+ "loss": 0.0665,
124
+ "step": 68
125
+ },
126
+ {
127
+ "epoch": 23.88,
128
+ "learning_rate": 0.0001,
129
+ "loss": 0.0593,
130
+ "step": 72
131
+ },
132
+ {
133
+ "epoch": 25.29,
134
+ "learning_rate": 0.0001,
135
+ "loss": 0.0652,
136
+ "step": 76
137
+ },
138
+ {
139
+ "epoch": 26.59,
140
+ "learning_rate": 0.0001,
141
+ "loss": 0.0896,
142
+ "step": 80
143
+ },
144
+ {
145
+ "epoch": 27.88,
146
+ "learning_rate": 0.0001,
147
+ "loss": 0.0483,
148
+ "step": 84
149
+ },
150
+ {
151
+ "epoch": 29.29,
152
+ "learning_rate": 0.0001,
153
+ "loss": 0.0497,
154
+ "step": 88
155
+ },
156
+ {
157
+ "epoch": 30.59,
158
+ "learning_rate": 0.0001,
159
+ "loss": 0.0479,
160
+ "step": 92
161
+ },
162
+ {
163
+ "epoch": 31.88,
164
+ "learning_rate": 0.0001,
165
+ "loss": 0.0376,
166
+ "step": 96
167
+ },
168
+ {
169
+ "epoch": 33.29,
170
+ "learning_rate": 0.0001,
171
+ "loss": 0.0374,
172
+ "step": 100
173
+ },
174
+ {
175
+ "epoch": 34.59,
176
+ "learning_rate": 0.0001,
177
+ "loss": 0.0342,
178
+ "step": 104
179
+ },
180
+ {
181
+ "epoch": 35.88,
182
+ "learning_rate": 0.0001,
183
+ "loss": 0.0354,
184
+ "step": 108
185
+ },
186
+ {
187
+ "epoch": 37.29,
188
+ "learning_rate": 0.0001,
189
+ "loss": 0.029,
190
+ "step": 112
191
+ },
192
+ {
193
+ "epoch": 38.59,
194
+ "learning_rate": 0.0001,
195
+ "loss": 0.0274,
196
+ "step": 116
197
+ },
198
+ {
199
+ "epoch": 39.88,
200
+ "learning_rate": 0.0001,
201
+ "loss": 0.024,
202
+ "step": 120
203
+ },
204
+ {
205
+ "epoch": 41.29,
206
+ "learning_rate": 0.0001,
207
+ "loss": 0.0331,
208
+ "step": 124
209
+ },
210
+ {
211
+ "epoch": 42.59,
212
+ "learning_rate": 0.0001,
213
+ "loss": 0.0204,
214
+ "step": 128
215
+ },
216
+ {
217
+ "epoch": 42.59,
218
+ "eval_exact_match": 0.6392649903288201,
219
+ "eval_loss": 0.19595815241336823,
220
+ "eval_runtime": 1825.3052,
221
+ "eval_samples_per_second": 0.566,
222
+ "step": 128
223
+ },
224
+ {
225
+ "epoch": 43.88,
226
+ "learning_rate": 0.0001,
227
+ "loss": 0.0191,
228
+ "step": 132
229
+ },
230
+ {
231
+ "epoch": 45.29,
232
+ "learning_rate": 0.0001,
233
+ "loss": 0.019,
234
+ "step": 136
235
+ },
236
+ {
237
+ "epoch": 46.59,
238
+ "learning_rate": 0.0001,
239
+ "loss": 0.0203,
240
+ "step": 140
241
+ },
242
+ {
243
+ "epoch": 47.88,
244
+ "learning_rate": 0.0001,
245
+ "loss": 0.0182,
246
+ "step": 144
247
+ },
248
+ {
249
+ "epoch": 49.29,
250
+ "learning_rate": 0.0001,
251
+ "loss": 0.0158,
252
+ "step": 148
253
+ },
254
+ {
255
+ "epoch": 50.59,
256
+ "learning_rate": 0.0001,
257
+ "loss": 0.0118,
258
+ "step": 152
259
+ },
260
+ {
261
+ "epoch": 51.88,
262
+ "learning_rate": 0.0001,
263
+ "loss": 0.0139,
264
+ "step": 156
265
+ },
266
+ {
267
+ "epoch": 53.29,
268
+ "learning_rate": 0.0001,
269
+ "loss": 0.016,
270
+ "step": 160
271
+ },
272
+ {
273
+ "epoch": 54.59,
274
+ "learning_rate": 0.0001,
275
+ "loss": 0.022,
276
+ "step": 164
277
+ },
278
+ {
279
+ "epoch": 55.88,
280
+ "learning_rate": 0.0001,
281
+ "loss": 0.0184,
282
+ "step": 168
283
+ },
284
+ {
285
+ "epoch": 57.29,
286
+ "learning_rate": 0.0001,
287
+ "loss": 0.0102,
288
+ "step": 172
289
+ },
290
+ {
291
+ "epoch": 58.59,
292
+ "learning_rate": 0.0001,
293
+ "loss": 0.0096,
294
+ "step": 176
295
+ },
296
+ {
297
+ "epoch": 59.88,
298
+ "learning_rate": 0.0001,
299
+ "loss": 0.0093,
300
+ "step": 180
301
+ },
302
+ {
303
+ "epoch": 61.29,
304
+ "learning_rate": 0.0001,
305
+ "loss": 0.0096,
306
+ "step": 184
307
+ },
308
+ {
309
+ "epoch": 62.59,
310
+ "learning_rate": 0.0001,
311
+ "loss": 0.0075,
312
+ "step": 188
313
+ },
314
+ {
315
+ "epoch": 63.88,
316
+ "learning_rate": 0.0001,
317
+ "loss": 0.0072,
318
+ "step": 192
319
+ },
320
+ {
321
+ "epoch": 63.88,
322
+ "eval_exact_match": 0.6460348162475822,
323
+ "eval_loss": 0.24530905485153198,
324
+ "eval_runtime": 1845.9477,
325
+ "eval_samples_per_second": 0.56,
326
+ "step": 192
327
+ },
328
+ {
329
+ "epoch": 65.29,
330
+ "learning_rate": 0.0001,
331
+ "loss": 0.007,
332
+ "step": 196
333
+ },
334
+ {
335
+ "epoch": 66.59,
336
+ "learning_rate": 0.0001,
337
+ "loss": 0.0091,
338
+ "step": 200
339
+ },
340
+ {
341
+ "epoch": 67.88,
342
+ "learning_rate": 0.0001,
343
+ "loss": 0.0082,
344
+ "step": 204
345
+ },
346
+ {
347
+ "epoch": 69.29,
348
+ "learning_rate": 0.0001,
349
+ "loss": 0.0063,
350
+ "step": 208
351
+ },
352
+ {
353
+ "epoch": 70.59,
354
+ "learning_rate": 0.0001,
355
+ "loss": 0.0447,
356
+ "step": 212
357
+ },
358
+ {
359
+ "epoch": 71.88,
360
+ "learning_rate": 0.0001,
361
+ "loss": 0.0114,
362
+ "step": 216
363
+ },
364
+ {
365
+ "epoch": 73.29,
366
+ "learning_rate": 0.0001,
367
+ "loss": 0.0065,
368
+ "step": 220
369
+ },
370
+ {
371
+ "epoch": 74.59,
372
+ "learning_rate": 0.0001,
373
+ "loss": 0.0046,
374
+ "step": 224
375
+ },
376
+ {
377
+ "epoch": 75.88,
378
+ "learning_rate": 0.0001,
379
+ "loss": 0.0054,
380
+ "step": 228
381
+ },
382
+ {
383
+ "epoch": 77.29,
384
+ "learning_rate": 0.0001,
385
+ "loss": 0.0048,
386
+ "step": 232
387
+ },
388
+ {
389
+ "epoch": 78.59,
390
+ "learning_rate": 0.0001,
391
+ "loss": 0.0051,
392
+ "step": 236
393
+ },
394
+ {
395
+ "epoch": 79.88,
396
+ "learning_rate": 0.0001,
397
+ "loss": 0.0041,
398
+ "step": 240
399
+ },
400
+ {
401
+ "epoch": 81.29,
402
+ "learning_rate": 0.0001,
403
+ "loss": 0.0044,
404
+ "step": 244
405
+ },
406
+ {
407
+ "epoch": 82.59,
408
+ "learning_rate": 0.0001,
409
+ "loss": 0.0222,
410
+ "step": 248
411
+ },
412
+ {
413
+ "epoch": 83.88,
414
+ "learning_rate": 0.0001,
415
+ "loss": 0.0049,
416
+ "step": 252
417
+ },
418
+ {
419
+ "epoch": 85.29,
420
+ "learning_rate": 0.0001,
421
+ "loss": 0.0632,
422
+ "step": 256
423
+ },
424
+ {
425
+ "epoch": 85.29,
426
+ "eval_exact_match": 0.648936170212766,
427
+ "eval_loss": 0.21414929628372192,
428
+ "eval_runtime": 1740.439,
429
+ "eval_samples_per_second": 0.594,
430
+ "step": 256
431
+ },
432
+ {
433
+ "epoch": 86.59,
434
+ "learning_rate": 0.0001,
435
+ "loss": 0.0277,
436
+ "step": 260
437
+ },
438
+ {
439
+ "epoch": 87.88,
440
+ "learning_rate": 0.0001,
441
+ "loss": 0.0074,
442
+ "step": 264
443
+ },
444
+ {
445
+ "epoch": 89.29,
446
+ "learning_rate": 0.0001,
447
+ "loss": 0.0041,
448
+ "step": 268
449
+ },
450
+ {
451
+ "epoch": 90.59,
452
+ "learning_rate": 0.0001,
453
+ "loss": 0.0027,
454
+ "step": 272
455
+ },
456
+ {
457
+ "epoch": 91.88,
458
+ "learning_rate": 0.0001,
459
+ "loss": 0.0026,
460
+ "step": 276
461
+ },
462
+ {
463
+ "epoch": 93.29,
464
+ "learning_rate": 0.0001,
465
+ "loss": 0.0024,
466
+ "step": 280
467
+ },
468
+ {
469
+ "epoch": 94.59,
470
+ "learning_rate": 0.0001,
471
+ "loss": 0.0021,
472
+ "step": 284
473
+ },
474
+ {
475
+ "epoch": 95.88,
476
+ "learning_rate": 0.0001,
477
+ "loss": 0.002,
478
+ "step": 288
479
+ },
480
+ {
481
+ "epoch": 97.29,
482
+ "learning_rate": 0.0001,
483
+ "loss": 0.002,
484
+ "step": 292
485
+ },
486
+ {
487
+ "epoch": 98.59,
488
+ "learning_rate": 0.0001,
489
+ "loss": 0.0018,
490
+ "step": 296
491
+ },
492
+ {
493
+ "epoch": 99.88,
494
+ "learning_rate": 0.0001,
495
+ "loss": 0.0017,
496
+ "step": 300
497
+ },
498
+ {
499
+ "epoch": 101.29,
500
+ "learning_rate": 0.0001,
501
+ "loss": 0.0019,
502
+ "step": 304
503
+ },
504
+ {
505
+ "epoch": 102.59,
506
+ "learning_rate": 0.0001,
507
+ "loss": 0.0017,
508
+ "step": 308
509
+ },
510
+ {
511
+ "epoch": 103.88,
512
+ "learning_rate": 0.0001,
513
+ "loss": 0.0016,
514
+ "step": 312
515
+ },
516
+ {
517
+ "epoch": 105.29,
518
+ "learning_rate": 0.0001,
519
+ "loss": 0.0018,
520
+ "step": 316
521
+ },
522
+ {
523
+ "epoch": 106.59,
524
+ "learning_rate": 0.0001,
525
+ "loss": 0.0015,
526
+ "step": 320
527
+ },
528
+ {
529
+ "epoch": 106.59,
530
+ "eval_exact_match": 0.6808510638297872,
531
+ "eval_loss": 0.2944816052913666,
532
+ "eval_runtime": 1854.2635,
533
+ "eval_samples_per_second": 0.558,
534
+ "step": 320
535
+ },
536
+ {
537
+ "epoch": 107.88,
538
+ "learning_rate": 0.0001,
539
+ "loss": 0.0013,
540
+ "step": 324
541
+ },
542
+ {
543
+ "epoch": 109.29,
544
+ "learning_rate": 0.0001,
545
+ "loss": 0.0014,
546
+ "step": 328
547
+ },
548
+ {
549
+ "epoch": 110.59,
550
+ "learning_rate": 0.0001,
551
+ "loss": 0.0017,
552
+ "step": 332
553
+ },
554
+ {
555
+ "epoch": 111.88,
556
+ "learning_rate": 0.0001,
557
+ "loss": 0.0013,
558
+ "step": 336
559
+ },
560
+ {
561
+ "epoch": 113.29,
562
+ "learning_rate": 0.0001,
563
+ "loss": 0.0013,
564
+ "step": 340
565
+ },
566
+ {
567
+ "epoch": 114.59,
568
+ "learning_rate": 0.0001,
569
+ "loss": 0.0011,
570
+ "step": 344
571
+ },
572
+ {
573
+ "epoch": 115.88,
574
+ "learning_rate": 0.0001,
575
+ "loss": 0.0015,
576
+ "step": 348
577
+ },
578
+ {
579
+ "epoch": 117.29,
580
+ "learning_rate": 0.0001,
581
+ "loss": 0.0021,
582
+ "step": 352
583
+ },
584
+ {
585
+ "epoch": 118.59,
586
+ "learning_rate": 0.0001,
587
+ "loss": 0.0013,
588
+ "step": 356
589
+ },
590
+ {
591
+ "epoch": 119.88,
592
+ "learning_rate": 0.0001,
593
+ "loss": 0.001,
594
+ "step": 360
595
+ },
596
+ {
597
+ "epoch": 121.29,
598
+ "learning_rate": 0.0001,
599
+ "loss": 0.0012,
600
+ "step": 364
601
+ },
602
+ {
603
+ "epoch": 122.59,
604
+ "learning_rate": 0.0001,
605
+ "loss": 0.001,
606
+ "step": 368
607
+ },
608
+ {
609
+ "epoch": 123.88,
610
+ "learning_rate": 0.0001,
611
+ "loss": 0.0009,
612
+ "step": 372
613
+ },
614
+ {
615
+ "epoch": 125.29,
616
+ "learning_rate": 0.0001,
617
+ "loss": 0.0013,
618
+ "step": 376
619
+ },
620
+ {
621
+ "epoch": 126.59,
622
+ "learning_rate": 0.0001,
623
+ "loss": 0.0012,
624
+ "step": 380
625
+ },
626
+ {
627
+ "epoch": 127.88,
628
+ "learning_rate": 0.0001,
629
+ "loss": 0.0009,
630
+ "step": 384
631
+ },
632
+ {
633
+ "epoch": 127.88,
634
+ "eval_exact_match": 0.6731141199226306,
635
+ "eval_loss": 0.31063422560691833,
636
+ "eval_runtime": 1905.6919,
637
+ "eval_samples_per_second": 0.543,
638
+ "step": 384
639
+ },
640
+ {
641
+ "epoch": 129.29,
642
+ "learning_rate": 0.0001,
643
+ "loss": 0.0012,
644
+ "step": 388
645
+ },
646
+ {
647
+ "epoch": 130.59,
648
+ "learning_rate": 0.0001,
649
+ "loss": 0.001,
650
+ "step": 392
651
+ },
652
+ {
653
+ "epoch": 131.88,
654
+ "learning_rate": 0.0001,
655
+ "loss": 0.0008,
656
+ "step": 396
657
+ },
658
+ {
659
+ "epoch": 133.29,
660
+ "learning_rate": 0.0001,
661
+ "loss": 0.0011,
662
+ "step": 400
663
+ },
664
+ {
665
+ "epoch": 134.59,
666
+ "learning_rate": 0.0001,
667
+ "loss": 0.0009,
668
+ "step": 404
669
+ },
670
+ {
671
+ "epoch": 135.88,
672
+ "learning_rate": 0.0001,
673
+ "loss": 0.0009,
674
+ "step": 408
675
+ },
676
+ {
677
+ "epoch": 137.29,
678
+ "learning_rate": 0.0001,
679
+ "loss": 0.0011,
680
+ "step": 412
681
+ },
682
+ {
683
+ "epoch": 138.59,
684
+ "learning_rate": 0.0001,
685
+ "loss": 0.0009,
686
+ "step": 416
687
+ },
688
+ {
689
+ "epoch": 139.88,
690
+ "learning_rate": 0.0001,
691
+ "loss": 0.001,
692
+ "step": 420
693
+ },
694
+ {
695
+ "epoch": 141.29,
696
+ "learning_rate": 0.0001,
697
+ "loss": 0.001,
698
+ "step": 424
699
+ },
700
+ {
701
+ "epoch": 142.59,
702
+ "learning_rate": 0.0001,
703
+ "loss