tscholak commited on
Commit
c9beb6e
1 Parent(s): 57782b3

add T5 model

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {" <": 32101, " <=": 32100}
config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "t5-3b",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 16384,
7
+ "d_kv": 128,
8
+ "d_model": 1024,
9
+ "decoder_start_token_id": 0,
10
+ "diversity_penalty": null,
11
+ "dropout_rate": 0.1,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "relu",
14
+ "gradient_checkpointing": false,
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "max_length": 512,
19
+ "model_type": "t5",
20
+ "n_positions": 512,
21
+ "num_decoder_layers": 24,
22
+ "num_heads": 32,
23
+ "num_layers": 24,
24
+ "output_past": true,
25
+ "pad_token_id": 0,
26
+ "relative_attention_num_buckets": 32,
27
+ "task_specific_params": {
28
+ "summarization": {
29
+ "early_stopping": true,
30
+ "length_penalty": 2.0,
31
+ "max_length": 200,
32
+ "min_length": 30,
33
+ "no_repeat_ngram_size": 3,
34
+ "num_beams": 4,
35
+ "prefix": "summarize: "
36
+ },
37
+ "translation_en_to_de": {
38
+ "early_stopping": true,
39
+ "max_length": 300,
40
+ "num_beams": 4,
41
+ "prefix": "translate English to German: "
42
+ },
43
+ "translation_en_to_fr": {
44
+ "early_stopping": true,
45
+ "max_length": 300,
46
+ "num_beams": 4,
47
+ "prefix": "translate English to French: "
48
+ },
49
+ "translation_en_to_ro": {
50
+ "early_stopping": true,
51
+ "max_length": 300,
52
+ "num_beams": 4,
53
+ "prefix": "translate English to Romanian: "
54
+ }
55
+ },
56
+ "transformers_version": "4.6.0.dev0",
57
+ "use_cache": true,
58
+ "vocab_size": 32102
59
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1664ca859e52d1cd96dc25f42599f6bf3596b8d5c75f2d1ba9db597381ce3f64
3
+ size 13641236
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b4a0f6023f0549a0bd428abd808116ca9b02cccff94e8244e1bd7a121aaa559
3
+ size 11406535108
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6c647f865dbdd3c524e97e5f8e73ff48c5c50afdd9820fc163c01eb594b00ed
3
+ size 623
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"]}
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
3
+ size 791656
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 100, "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"], "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "t5-3b"}
trainer_state.json ADDED
@@ -0,0 +1,1790 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 362.58514285714284,
5
+ "global_step": 1088,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.29,
12
+ "learning_rate": 0.0001,
13
+ "loss": 3.7248,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 1.29,
18
+ "learning_rate": 0.0001,
19
+ "loss": 2.1406,
20
+ "step": 4
21
+ },
22
+ {
23
+ "epoch": 2.59,
24
+ "learning_rate": 0.0001,
25
+ "loss": 1.0705,
26
+ "step": 8
27
+ },
28
+ {
29
+ "epoch": 3.88,
30
+ "learning_rate": 0.0001,
31
+ "loss": 0.6039,
32
+ "step": 12
33
+ },
34
+ {
35
+ "epoch": 5.29,
36
+ "learning_rate": 0.0001,
37
+ "loss": 0.4378,
38
+ "step": 16
39
+ },
40
+ {
41
+ "epoch": 6.59,
42
+ "learning_rate": 0.0001,
43
+ "loss": 0.3218,
44
+ "step": 20
45
+ },
46
+ {
47
+ "epoch": 7.88,
48
+ "learning_rate": 0.0001,
49
+ "loss": 0.2751,
50
+ "step": 24
51
+ },
52
+ {
53
+ "epoch": 9.29,
54
+ "learning_rate": 0.0001,
55
+ "loss": 0.2423,
56
+ "step": 28
57
+ },
58
+ {
59
+ "epoch": 10.59,
60
+ "learning_rate": 0.0001,
61
+ "loss": 0.1763,
62
+ "step": 32
63
+ },
64
+ {
65
+ "epoch": 11.88,
66
+ "learning_rate": 0.0001,
67
+ "loss": 0.1574,
68
+ "step": 36
69
+ },
70
+ {
71
+ "epoch": 13.29,
72
+ "learning_rate": 0.0001,
73
+ "loss": 0.137,
74
+ "step": 40
75
+ },
76
+ {
77
+ "epoch": 14.59,
78
+ "learning_rate": 0.0001,
79
+ "loss": 0.1274,
80
+ "step": 44
81
+ },
82
+ {
83
+ "epoch": 15.88,
84
+ "learning_rate": 0.0001,
85
+ "loss": 0.141,
86
+ "step": 48
87
+ },
88
+ {
89
+ "epoch": 17.29,
90
+ "learning_rate": 0.0001,
91
+ "loss": 0.1345,
92
+ "step": 52
93
+ },
94
+ {
95
+ "epoch": 18.59,
96
+ "learning_rate": 0.0001,
97
+ "loss": 0.0848,
98
+ "step": 56
99
+ },
100
+ {
101
+ "epoch": 19.88,
102
+ "learning_rate": 0.0001,
103
+ "loss": 0.0719,
104
+ "step": 60
105
+ },
106
+ {
107
+ "epoch": 21.29,
108
+ "learning_rate": 0.0001,
109
+ "loss": 0.0814,
110
+ "step": 64
111
+ },
112
+ {
113
+ "epoch": 21.29,
114
+ "eval_exact_match": 0.5145067698259188,
115
+ "eval_loss": 0.15547168254852295,
116
+ "eval_runtime": 2276.8146,
117
+ "eval_samples_per_second": 0.454,
118
+ "step": 64
119
+ },
120
+ {
121
+ "epoch": 22.59,
122
+ "learning_rate": 0.0001,
123
+ "loss": 0.0665,
124
+ "step": 68
125
+ },
126
+ {
127
+ "epoch": 23.88,
128
+ "learning_rate": 0.0001,
129
+ "loss": 0.0593,
130
+ "step": 72
131
+ },
132
+ {
133
+ "epoch": 25.29,
134
+ "learning_rate": 0.0001,
135
+ "loss": 0.0652,
136
+ "step": 76
137
+ },
138
+ {
139
+ "epoch": 26.59,
140
+ "learning_rate": 0.0001,
141
+ "loss": 0.0896,
142
+ "step": 80
143
+ },
144
+ {
145
+ "epoch": 27.88,
146
+ "learning_rate": 0.0001,
147
+ "loss": 0.0483,
148
+ "step": 84
149
+ },
150
+ {
151
+ "epoch": 29.29,
152
+ "learning_rate": 0.0001,
153
+ "loss": 0.0497,
154
+ "step": 88
155
+ },
156
+ {
157
+ "epoch": 30.59,
158
+ "learning_rate": 0.0001,
159
+ "loss": 0.0479,
160
+ "step": 92
161
+ },
162
+ {
163
+ "epoch": 31.88,
164
+ "learning_rate": 0.0001,
165
+ "loss": 0.0376,
166
+ "step": 96
167
+ },
168
+ {
169
+ "epoch": 33.29,
170
+ "learning_rate": 0.0001,
171
+ "loss": 0.0374,
172
+ "step": 100
173
+ },
174
+ {
175
+ "epoch": 34.59,
176
+ "learning_rate": 0.0001,
177
+ "loss": 0.0342,
178
+ "step": 104
179
+ },
180
+ {
181
+ "epoch": 35.88,
182
+ "learning_rate": 0.0001,
183
+ "loss": 0.0354,
184
+ "step": 108
185
+ },
186
+ {
187
+ "epoch": 37.29,
188
+ "learning_rate": 0.0001,
189
+ "loss": 0.029,
190
+ "step": 112
191
+ },
192
+ {
193
+ "epoch": 38.59,
194
+ "learning_rate": 0.0001,
195
+ "loss": 0.0274,
196
+ "step": 116
197
+ },
198
+ {
199
+ "epoch": 39.88,
200
+ "learning_rate": 0.0001,
201
+ "loss": 0.024,
202
+ "step": 120
203
+ },
204
+ {
205
+ "epoch": 41.29,
206
+ "learning_rate": 0.0001,
207
+ "loss": 0.0331,
208
+ "step": 124
209
+ },
210
+ {
211
+ "epoch": 42.59,
212
+ "learning_rate": 0.0001,
213
+ "loss": 0.0204,
214
+ "step": 128
215
+ },
216
+ {
217
+ "epoch": 42.59,
218
+ "eval_exact_match": 0.6392649903288201,
219
+ "eval_loss": 0.19595815241336823,
220
+ "eval_runtime": 1825.3052,
221
+ "eval_samples_per_second": 0.566,
222
+ "step": 128
223
+ },
224
+ {
225
+ "epoch": 43.88,
226
+ "learning_rate": 0.0001,
227
+ "loss": 0.0191,
228
+ "step": 132
229
+ },
230
+ {
231
+ "epoch": 45.29,
232
+ "learning_rate": 0.0001,
233
+ "loss": 0.019,
234
+ "step": 136
235
+ },
236
+ {
237
+ "epoch": 46.59,
238
+ "learning_rate": 0.0001,
239
+ "loss": 0.0203,
240
+ "step": 140
241
+ },
242
+ {
243
+ "epoch": 47.88,
244
+ "learning_rate": 0.0001,
245
+ "loss": 0.0182,
246
+ "step": 144
247
+ },
248
+ {
249
+ "epoch": 49.29,
250
+ "learning_rate": 0.0001,
251
+ "loss": 0.0158,
252
+ "step": 148
253
+ },
254
+ {
255
+ "epoch": 50.59,
256
+ "learning_rate": 0.0001,
257
+ "loss": 0.0118,
258
+ "step": 152
259
+ },
260
+ {
261
+ "epoch": 51.88,
262
+ "learning_rate": 0.0001,
263
+ "loss": 0.0139,
264
+ "step": 156
265
+ },
266
+ {
267
+ "epoch": 53.29,
268
+ "learning_rate": 0.0001,
269
+ "loss": 0.016,
270
+ "step": 160
271
+ },
272
+ {
273
+ "epoch": 54.59,
274
+ "learning_rate": 0.0001,
275
+ "loss": 0.022,
276
+ "step": 164
277
+ },
278
+ {
279
+ "epoch": 55.88,
280
+ "learning_rate": 0.0001,
281
+ "loss": 0.0184,
282
+ "step": 168
283
+ },
284
+ {
285
+ "epoch": 57.29,
286
+ "learning_rate": 0.0001,
287
+ "loss": 0.0102,
288
+ "step": 172
289
+ },
290
+ {
291
+ "epoch": 58.59,
292
+ "learning_rate": 0.0001,
293
+ "loss": 0.0096,
294
+ "step": 176
295
+ },
296
+ {
297
+ "epoch": 59.88,
298
+ "learning_rate": 0.0001,
299
+ "loss": 0.0093,
300
+ "step": 180
301
+ },
302
+ {
303
+ "epoch": 61.29,
304
+ "learning_rate": 0.0001,
305
+ "loss": 0.0096,
306
+ "step": 184
307
+ },
308
+ {
309
+ "epoch": 62.59,
310
+ "learning_rate": 0.0001,
311
+ "loss": 0.0075,
312
+ "step": 188
313
+ },
314
+ {
315
+ "epoch": 63.88,
316
+ "learning_rate": 0.0001,
317
+ "loss": 0.0072,
318
+ "step": 192
319
+ },
320
+ {
321
+ "epoch": 63.88,
322
+ "eval_exact_match": 0.6460348162475822,
323
+ "eval_loss": 0.24530905485153198,
324
+ "eval_runtime": 1845.9477,
325
+ "eval_samples_per_second": 0.56,
326
+ "step": 192
327
+ },
328
+ {
329
+ "epoch": 65.29,
330
+ "learning_rate": 0.0001,
331
+ "loss": 0.007,
332
+ "step": 196
333
+ },
334
+ {
335
+ "epoch": 66.59,
336
+ "learning_rate": 0.0001,
337
+ "loss": 0.0091,
338
+ "step": 200
339
+ },
340
+ {
341
+ "epoch": 67.88,
342
+ "learning_rate": 0.0001,
343
+ "loss": 0.0082,
344
+ "step": 204
345
+ },
346
+ {
347
+ "epoch": 69.29,
348
+ "learning_rate": 0.0001,
349
+ "loss": 0.0063,
350
+ "step": 208
351
+ },
352
+ {
353
+ "epoch": 70.59,
354
+ "learning_rate": 0.0001,
355
+ "loss": 0.0447,
356
+ "step": 212
357
+ },
358
+ {
359
+ "epoch": 71.88,
360
+ "learning_rate": 0.0001,
361
+ "loss": 0.0114,
362
+ "step": 216
363
+ },
364
+ {
365
+ "epoch": 73.29,
366
+ "learning_rate": 0.0001,
367
+ "loss": 0.0065,
368
+ "step": 220
369
+ },
370
+ {
371
+ "epoch": 74.59,
372
+ "learning_rate": 0.0001,
373
+ "loss": 0.0046,
374
+ "step": 224
375
+ },
376
+ {
377
+ "epoch": 75.88,
378
+ "learning_rate": 0.0001,
379
+ "loss": 0.0054,
380
+ "step": 228
381
+ },
382
+ {
383
+ "epoch": 77.29,
384
+ "learning_rate": 0.0001,
385
+ "loss": 0.0048,
386
+ "step": 232
387
+ },
388
+ {
389
+ "epoch": 78.59,
390
+ "learning_rate": 0.0001,
391
+ "loss": 0.0051,
392
+ "step": 236
393
+ },
394
+ {
395
+ "epoch": 79.88,
396
+ "learning_rate": 0.0001,
397
+ "loss": 0.0041,
398
+ "step": 240
399
+ },
400
+ {
401
+ "epoch": 81.29,
402
+ "learning_rate": 0.0001,
403
+ "loss": 0.0044,
404
+ "step": 244
405
+ },
406
+ {
407
+ "epoch": 82.59,
408
+ "learning_rate": 0.0001,
409
+ "loss": 0.0222,
410
+ "step": 248
411
+ },
412
+ {
413
+ "epoch": 83.88,
414
+ "learning_rate": 0.0001,
415
+ "loss": 0.0049,
416
+ "step": 252
417
+ },
418
+ {
419
+ "epoch": 85.29,
420
+ "learning_rate": 0.0001,
421
+ "loss": 0.0632,
422
+ "step": 256
423
+ },
424
+ {
425
+ "epoch": 85.29,
426
+ "eval_exact_match": 0.648936170212766,
427
+ "eval_loss": 0.21414929628372192,
428
+ "eval_runtime": 1740.439,
429
+ "eval_samples_per_second": 0.594,
430
+ "step": 256
431
+ },
432
+ {
433
+ "epoch": 86.59,
434
+ "learning_rate": 0.0001,
435
+ "loss": 0.0277,
436
+ "step": 260
437
+ },
438
+ {
439
+ "epoch": 87.88,
440
+ "learning_rate": 0.0001,
441
+ "loss": 0.0074,
442
+ "step": 264
443
+ },
444
+ {
445
+ "epoch": 89.29,
446
+ "learning_rate": 0.0001,
447
+ "loss": 0.0041,
448
+ "step": 268
449
+ },
450
+ {
451
+ "epoch": 90.59,
452
+ "learning_rate": 0.0001,
453
+ "loss": 0.0027,
454
+ "step": 272
455
+ },
456
+ {
457
+ "epoch": 91.88,
458
+ "learning_rate": 0.0001,
459
+ "loss": 0.0026,
460
+ "step": 276
461
+ },
462
+ {
463
+ "epoch": 93.29,
464
+ "learning_rate": 0.0001,
465
+ "loss": 0.0024,
466
+ "step": 280
467
+ },
468
+ {
469
+ "epoch": 94.59,
470
+ "learning_rate": 0.0001,
471
+ "loss": 0.0021,
472
+ "step": 284
473
+ },
474
+ {
475
+ "epoch": 95.88,
476
+ "learning_rate": 0.0001,
477
+ "loss": 0.002,
478
+ "step": 288
479
+ },
480
+ {
481
+ "epoch": 97.29,
482
+ "learning_rate": 0.0001,
483
+ "loss": 0.002,
484
+ "step": 292
485
+ },
486
+ {
487
+ "epoch": 98.59,
488
+ "learning_rate": 0.0001,
489
+ "loss": 0.0018,
490
+ "step": 296
491
+ },
492
+ {
493
+ "epoch": 99.88,
494
+ "learning_rate": 0.0001,
495
+ "loss": 0.0017,
496
+ "step": 300
497
+ },
498
+ {
499
+ "epoch": 101.29,
500
+ "learning_rate": 0.0001,
501
+ "loss": 0.0019,
502
+ "step": 304
503
+ },
504
+ {
505
+ "epoch": 102.59,
506
+ "learning_rate": 0.0001,
507
+ "loss": 0.0017,
508
+ "step": 308
509
+ },
510
+ {
511
+ "epoch": 103.88,
512
+ "learning_rate": 0.0001,
513
+ "loss": 0.0016,
514
+ "step": 312
515
+ },
516
+ {
517
+ "epoch": 105.29,
518
+ "learning_rate": 0.0001,
519
+ "loss": 0.0018,
520
+ "step": 316
521
+ },
522
+ {
523
+ "epoch": 106.59,
524
+ "learning_rate": 0.0001,
525
+ "loss": 0.0015,
526
+ "step": 320
527
+ },
528
+ {
529
+ "epoch": 106.59,
530
+ "eval_exact_match": 0.6808510638297872,
531
+ "eval_loss": 0.2944816052913666,
532
+ "eval_runtime": 1854.2635,
533
+ "eval_samples_per_second": 0.558,
534
+ "step": 320
535
+ },
536
+ {
537
+ "epoch": 107.88,
538
+ "learning_rate": 0.0001,
539
+ "loss": 0.0013,
540
+ "step": 324
541
+ },
542
+ {
543
+ "epoch": 109.29,
544
+ "learning_rate": 0.0001,
545
+ "loss": 0.0014,
546
+ "step": 328
547
+ },
548
+ {
549
+ "epoch": 110.59,
550
+ "learning_rate": 0.0001,
551
+ "loss": 0.0017,
552
+ "step": 332
553
+ },
554
+ {
555
+ "epoch": 111.88,
556
+ "learning_rate": 0.0001,
557
+ "loss": 0.0013,
558
+ "step": 336
559
+ },
560
+ {
561
+ "epoch": 113.29,
562
+ "learning_rate": 0.0001,
563
+ "loss": 0.0013,
564
+ "step": 340
565
+ },
566
+ {
567
+ "epoch": 114.59,
568
+ "learning_rate": 0.0001,
569
+ "loss": 0.0011,
570
+ "step": 344
571
+ },
572
+ {
573
+ "epoch": 115.88,
574
+ "learning_rate": 0.0001,
575
+ "loss": 0.0015,
576
+ "step": 348
577
+ },
578
+ {
579
+ "epoch": 117.29,
580
+ "learning_rate": 0.0001,
581
+ "loss": 0.0021,
582
+ "step": 352
583
+ },
584
+ {
585
+ "epoch": 118.59,
586
+ "learning_rate": 0.0001,
587
+ "loss": 0.0013,
588
+ "step": 356
589
+ },
590
+ {
591
+ "epoch": 119.88,
592
+ "learning_rate": 0.0001,
593
+ "loss": 0.001,
594
+ "step": 360
595
+ },
596
+ {
597
+ "epoch": 121.29,
598
+ "learning_rate": 0.0001,
599
+ "loss": 0.0012,
600
+ "step": 364
601
+ },
602
+ {
603
+ "epoch": 122.59,
604
+ "learning_rate": 0.0001,
605
+ "loss": 0.001,
606
+ "step": 368
607
+ },
608
+ {
609
+ "epoch": 123.88,
610
+ "learning_rate": 0.0001,
611
+ "loss": 0.0009,
612
+ "step": 372
613
+ },
614
+ {
615
+ "epoch": 125.29,
616
+ "learning_rate": 0.0001,
617
+ "loss": 0.0013,
618
+ "step": 376
619
+ },
620
+ {
621
+ "epoch": 126.59,
622
+ "learning_rate": 0.0001,
623
+ "loss": 0.0012,
624
+ "step": 380
625
+ },
626
+ {
627
+ "epoch": 127.88,
628
+ "learning_rate": 0.0001,
629
+ "loss": 0.0009,
630
+ "step": 384
631
+ },
632
+ {
633
+ "epoch": 127.88,
634
+ "eval_exact_match": 0.6731141199226306,
635
+ "eval_loss": 0.31063422560691833,
636
+ "eval_runtime": 1905.6919,
637
+ "eval_samples_per_second": 0.543,
638
+ "step": 384
639
+ },
640
+ {
641
+ "epoch": 129.29,
642
+ "learning_rate": 0.0001,
643
+ "loss": 0.0012,
644
+ "step": 388
645
+ },
646
+ {
647
+ "epoch": 130.59,
648
+ "learning_rate": 0.0001,
649
+ "loss": 0.001,
650
+ "step": 392
651
+ },
652
+ {
653
+ "epoch": 131.88,
654
+ "learning_rate": 0.0001,
655
+ "loss": 0.0008,
656
+ "step": 396
657
+ },
658
+ {
659
+ "epoch": 133.29,
660
+ "learning_rate": 0.0001,
661
+ "loss": 0.0011,
662
+ "step": 400
663
+ },
664
+ {
665
+ "epoch": 134.59,
666
+ "learning_rate": 0.0001,
667
+ "loss": 0.0009,
668
+ "step": 404
669
+ },
670
+ {
671
+ "epoch": 135.88,
672
+ "learning_rate": 0.0001,
673
+ "loss": 0.0009,
674
+ "step": 408
675
+ },
676
+ {
677
+ "epoch": 137.29,
678
+ "learning_rate": 0.0001,
679
+ "loss": 0.0011,
680
+ "step": 412
681
+ },
682
+ {
683
+ "epoch": 138.59,
684
+ "learning_rate": 0.0001,
685
+ "loss": 0.0009,
686
+ "step": 416
687
+ },
688
+ {
689
+ "epoch": 139.88,
690
+ "learning_rate": 0.0001,
691
+ "loss": 0.001,
692
+ "step": 420
693
+ },
694
+ {
695
+ "epoch": 141.29,
696
+ "learning_rate": 0.0001,
697
+ "loss": 0.001,
698
+ "step": 424
699
+ },
700
+ {
701
+ "epoch": 142.59,
702
+ "learning_rate": 0.0001,
703
+ "loss": 0.0007,
704
+ "step": 428
705
+ },
706
+ {
707
+ "epoch": 143.88,
708
+ "learning_rate": 0.0001,
709
+ "loss": 0.0008,
710
+ "step": 432
711
+ },
712
+ {
713
+ "epoch": 145.29,
714
+ "learning_rate": 0.0001,
715
+ "loss": 0.001,
716
+ "step": 436
717
+ },
718
+ {
719
+ "epoch": 146.59,
720
+ "learning_rate": 0.0001,
721
+ "loss": 0.0009,
722
+ "step": 440
723
+ },
724
+ {
725
+ "epoch": 147.88,
726
+ "learning_rate": 0.0001,
727
+ "loss": 0.0009,
728
+ "step": 444
729
+ },
730
+ {
731
+ "epoch": 149.29,
732
+ "learning_rate": 0.0001,
733
+ "loss": 0.001,
734
+ "step": 448
735
+ },
736
+ {
737
+ "epoch": 149.29,
738
+ "eval_exact_match": 0.6847195357833655,
739
+ "eval_loss": 0.32885223627090454,
740
+ "eval_runtime": 1904.5416,
741
+ "eval_samples_per_second": 0.543,
742
+ "step": 448
743
+ },
744
+ {
745
+ "epoch": 150.59,
746
+ "learning_rate": 0.0001,
747
+ "loss": 0.0012,
748
+ "step": 452
749
+ },
750
+ {
751
+ "epoch": 151.88,
752
+ "learning_rate": 0.0001,
753
+ "loss": 0.0013,
754
+ "step": 456
755
+ },
756
+ {
757
+ "epoch": 153.29,
758
+ "learning_rate": 0.0001,
759
+ "loss": 0.001,
760
+ "step": 460
761
+ },
762
+ {
763
+ "epoch": 154.59,
764
+ "learning_rate": 0.0001,
765
+ "loss": 0.0008,
766
+ "step": 464
767
+ },
768
+ {
769
+ "epoch": 155.88,
770
+ "learning_rate": 0.0001,
771
+ "loss": 0.0006,
772
+ "step": 468
773
+ },
774
+ {
775
+ "epoch": 157.29,
776
+ "learning_rate": 0.0001,
777
+ "loss": 0.0008,
778
+ "step": 472
779
+ },
780
+ {
781
+ "epoch": 158.59,
782
+ "learning_rate": 0.0001,
783
+ "loss": 0.0096,
784
+ "step": 476
785
+ },
786
+ {
787
+ "epoch": 159.88,
788
+ "learning_rate": 0.0001,
789
+ "loss": 0.002,
790
+ "step": 480
791
+ },
792
+ {
793
+ "epoch": 161.29,
794
+ "learning_rate": 0.0001,
795
+ "loss": 0.0009,
796
+ "step": 484
797
+ },
798
+ {
799
+ "epoch": 162.59,
800
+ "learning_rate": 0.0001,
801
+ "loss": 0.0007,
802
+ "step": 488
803
+ },
804
+ {
805
+ "epoch": 163.88,
806
+ "learning_rate": 0.0001,
807
+ "loss": 0.0007,
808
+ "step": 492
809
+ },
810
+ {
811
+ "epoch": 165.29,
812
+ "learning_rate": 0.0001,
813
+ "loss": 0.0009,
814
+ "step": 496
815
+ },
816
+ {
817
+ "epoch": 166.59,
818
+ "learning_rate": 0.0001,
819
+ "loss": 0.0007,
820
+ "step": 500
821
+ },
822
+ {
823
+ "epoch": 167.88,
824
+ "learning_rate": 0.0001,
825
+ "loss": 0.0009,
826
+ "step": 504
827
+ },
828
+ {
829
+ "epoch": 169.29,
830
+ "learning_rate": 0.0001,
831
+ "loss": 0.0148,
832
+ "step": 508
833
+ },
834
+ {
835
+ "epoch": 170.59,
836
+ "learning_rate": 0.0001,
837
+ "loss": 0.0127,
838
+ "step": 512
839
+ },
840
+ {
841
+ "epoch": 170.59,
842
+ "eval_exact_match": 0.6740812379110251,
843
+ "eval_loss": 0.22413159906864166,
844
+ "eval_runtime": 1746.478,
845
+ "eval_samples_per_second": 0.592,
846
+ "step": 512
847
+ },
848
+ {
849
+ "epoch": 171.88,
850
+ "learning_rate": 0.0001,
851
+ "loss": 0.0023,
852
+ "step": 516
853
+ },
854
+ {
855
+ "epoch": 173.29,
856
+ "learning_rate": 0.0001,
857
+ "loss": 0.001,
858
+ "step": 520
859
+ },
860
+ {
861
+ "epoch": 174.59,
862
+ "learning_rate": 0.0001,
863
+ "loss": 0.0007,
864
+ "step": 524
865
+ },
866
+ {
867
+ "epoch": 175.88,
868
+ "learning_rate": 0.0001,
869
+ "loss": 0.0009,
870
+ "step": 528
871
+ },
872
+ {
873
+ "epoch": 177.29,
874
+ "learning_rate": 0.0001,
875
+ "loss": 0.001,
876
+ "step": 532
877
+ },
878
+ {
879
+ "epoch": 178.59,
880
+ "learning_rate": 0.0001,
881
+ "loss": 0.0006,
882
+ "step": 536
883
+ },
884
+ {
885
+ "epoch": 179.88,
886
+ "learning_rate": 0.0001,
887
+ "loss": 0.0044,
888
+ "step": 540
889
+ },
890
+ {
891
+ "epoch": 181.29,
892
+ "learning_rate": 0.0001,
893
+ "loss": 0.0013,
894
+ "step": 544
895
+ },
896
+ {
897
+ "epoch": 182.59,
898
+ "learning_rate": 0.0001,
899
+ "loss": 0.0009,
900
+ "step": 548
901
+ },
902
+ {
903
+ "epoch": 183.88,
904
+ "learning_rate": 0.0001,
905
+ "loss": 0.0006,
906
+ "step": 552
907
+ },
908
+ {
909
+ "epoch": 185.29,
910
+ "learning_rate": 0.0001,
911
+ "loss": 0.0007,
912
+ "step": 556
913
+ },
914
+ {
915
+ "epoch": 186.59,
916
+ "learning_rate": 0.0001,
917
+ "loss": 0.0005,
918
+ "step": 560
919
+ },
920
+ {
921
+ "epoch": 187.88,
922
+ "learning_rate": 0.0001,
923
+ "loss": 0.0007,
924
+ "step": 564
925
+ },
926
+ {
927
+ "epoch": 189.29,
928
+ "learning_rate": 0.0001,
929
+ "loss": 0.0009,
930
+ "step": 568
931
+ },
932
+ {
933
+ "epoch": 190.59,
934
+ "learning_rate": 0.0001,
935
+ "loss": 0.0005,
936
+ "step": 572
937
+ },
938
+ {
939
+ "epoch": 191.88,
940
+ "learning_rate": 0.0001,
941
+ "loss": 0.0013,
942
+ "step": 576
943
+ },
944
+ {
945
+ "epoch": 191.88,
946
+ "eval_exact_match": 0.6876208897485493,
947
+ "eval_loss": 0.3054460287094116,
948
+ "eval_runtime": 1863.2506,
949
+ "eval_samples_per_second": 0.555,
950
+ "step": 576
951
+ },
952
+ {
953
+ "epoch": 193.29,
954
+ "learning_rate": 0.0001,
955
+ "loss": 0.0006,
956
+ "step": 580
957
+ },
958
+ {
959
+ "epoch": 194.59,
960
+ "learning_rate": 0.0001,
961
+ "loss": 0.0005,
962
+ "step": 584
963
+ },
964
+ {
965
+ "epoch": 195.88,
966
+ "learning_rate": 0.0001,
967
+ "loss": 0.0007,
968
+ "step": 588
969
+ },
970
+ {
971
+ "epoch": 197.29,
972
+ "learning_rate": 0.0001,
973
+ "loss": 0.0069,
974
+ "step": 592
975
+ },
976
+ {
977
+ "epoch": 198.59,
978
+ "learning_rate": 0.0001,
979
+ "loss": 0.0008,
980
+ "step": 596
981
+ },
982
+ {
983
+ "epoch": 199.88,
984
+ "learning_rate": 0.0001,
985
+ "loss": 0.0005,
986
+ "step": 600
987
+ },
988
+ {
989
+ "epoch": 201.29,
990
+ "learning_rate": 0.0001,
991
+ "loss": 0.0023,
992
+ "step": 604
993
+ },
994
+ {
995
+ "epoch": 202.59,
996
+ "learning_rate": 0.0001,
997
+ "loss": 0.0024,
998
+ "step": 608
999
+ },
1000
+ {
1001
+ "epoch": 203.88,
1002
+ "learning_rate": 0.0001,
1003
+ "loss": 0.0253,
1004
+ "step": 612
1005
+ },
1006
+ {
1007
+ "epoch": 205.29,
1008
+ "learning_rate": 0.0001,
1009
+ "loss": 0.0083,
1010
+ "step": 616
1011
+ },
1012
+ {
1013
+ "epoch": 206.59,
1014
+ "learning_rate": 0.0001,
1015
+ "loss": 0.0009,
1016
+ "step": 620
1017
+ },
1018
+ {
1019
+ "epoch": 207.88,
1020
+ "learning_rate": 0.0001,
1021
+ "loss": 0.0007,
1022
+ "step": 624
1023
+ },
1024
+ {
1025
+ "epoch": 209.29,
1026
+ "learning_rate": 0.0001,
1027
+ "loss": 0.0006,
1028
+ "step": 628
1029
+ },
1030
+ {
1031
+ "epoch": 210.59,
1032
+ "learning_rate": 0.0001,
1033
+ "loss": 0.0004,
1034
+ "step": 632
1035
+ },
1036
+ {
1037
+ "epoch": 211.88,
1038
+ "learning_rate": 0.0001,
1039
+ "loss": 0.0005,
1040
+ "step": 636
1041
+ },
1042
+ {
1043
+ "epoch": 213.29,
1044
+ "learning_rate": 0.0001,
1045
+ "loss": 0.0019,
1046
+ "step": 640
1047
+ },
1048
+ {
1049
+ "epoch": 213.29,
1050
+ "eval_exact_match": 0.6382978723404256,
1051
+ "eval_loss": 0.3269137442111969,
1052
+ "eval_runtime": 2353.6231,
1053
+ "eval_samples_per_second": 0.439,
1054
+ "step": 640
1055
+ },
1056
+ {
1057
+ "epoch": 214.59,
1058
+ "learning_rate": 0.0001,
1059
+ "loss": 0.0095,
1060
+ "step": 644
1061
+ },
1062
+ {
1063
+ "epoch": 215.88,
1064
+ "learning_rate": 0.0001,
1065
+ "loss": 0.0007,
1066
+ "step": 648
1067
+ },
1068
+ {
1069
+ "epoch": 217.29,
1070
+ "learning_rate": 0.0001,
1071
+ "loss": 0.0007,
1072
+ "step": 652
1073
+ },
1074
+ {
1075
+ "epoch": 218.59,
1076
+ "learning_rate": 0.0001,
1077
+ "loss": 0.0006,
1078
+ "step": 656
1079
+ },
1080
+ {
1081
+ "epoch": 219.88,
1082
+ "learning_rate": 0.0001,
1083
+ "loss": 0.0004,
1084
+ "step": 660
1085
+ },
1086
+ {
1087
+ "epoch": 221.29,
1088
+ "learning_rate": 0.0001,
1089
+ "loss": 0.0004,
1090
+ "step": 664
1091
+ },
1092
+ {
1093
+ "epoch": 222.59,
1094
+ "learning_rate": 0.0001,
1095
+ "loss": 0.0004,
1096
+ "step": 668
1097
+ },
1098
+ {
1099
+ "epoch": 223.88,
1100
+ "learning_rate": 0.0001,
1101
+ "loss": 0.001,
1102
+ "step": 672
1103
+ },
1104
+ {
1105
+ "epoch": 225.29,
1106
+ "learning_rate": 0.0001,
1107
+ "loss": 0.0005,
1108
+ "step": 676
1109
+ },
1110
+ {
1111
+ "epoch": 226.59,
1112
+ "learning_rate": 0.0001,
1113
+ "loss": 0.0004,
1114
+ "step": 680
1115
+ },
1116
+ {
1117
+ "epoch": 227.88,
1118
+ "learning_rate": 0.0001,
1119
+ "loss": 0.0003,
1120
+ "step": 684
1121
+ },
1122
+ {
1123
+ "epoch": 229.29,
1124
+ "learning_rate": 0.0001,
1125
+ "loss": 0.0003,
1126
+ "step": 688
1127
+ },
1128
+ {
1129
+ "epoch": 230.59,
1130
+ "learning_rate": 0.0001,
1131
+ "loss": 0.0003,
1132
+ "step": 692
1133
+ },
1134
+ {
1135
+ "epoch": 231.88,
1136
+ "learning_rate": 0.0001,
1137
+ "loss": 0.0005,
1138
+ "step": 696
1139
+ },
1140
+ {
1141
+ "epoch": 233.29,
1142
+ "learning_rate": 0.0001,
1143
+ "loss": 0.0005,
1144
+ "step": 700
1145
+ },
1146
+ {
1147
+ "epoch": 234.59,
1148
+ "learning_rate": 0.0001,
1149
+ "loss": 0.0101,
1150
+ "step": 704
1151
+ },
1152
+ {
1153
+ "epoch": 234.59,
1154
+ "eval_exact_match": 0.6769825918762089,
1155
+ "eval_loss": 0.28256723284721375,
1156
+ "eval_runtime": 1983.1098,
1157
+ "eval_samples_per_second": 0.521,
1158
+ "step": 704
1159
+ },
1160
+ {
1161
+ "epoch": 235.88,
1162
+ "learning_rate": 0.0001,
1163
+ "loss": 0.005,
1164
+ "step": 708
1165
+ },
1166
+ {
1167
+ "epoch": 237.29,
1168
+ "learning_rate": 0.0001,
1169
+ "loss": 0.0025,
1170
+ "step": 712
1171
+ },
1172
+ {
1173
+ "epoch": 238.59,
1174
+ "learning_rate": 0.0001,
1175
+ "loss": 0.0009,
1176
+ "step": 716
1177
+ },
1178
+ {
1179
+ "epoch": 239.88,
1180
+ "learning_rate": 0.0001,
1181
+ "loss": 0.0007,
1182
+ "step": 720
1183
+ },
1184
+ {
1185
+ "epoch": 241.29,
1186
+ "learning_rate": 0.0001,
1187
+ "loss": 0.0007,
1188
+ "step": 724
1189
+ },
1190
+ {
1191
+ "epoch": 242.59,
1192
+ "learning_rate": 0.0001,
1193
+ "loss": 0.0004,
1194
+ "step": 728
1195
+ },
1196
+ {
1197
+ "epoch": 243.88,
1198
+ "learning_rate": 0.0001,
1199
+ "loss": 0.0003,
1200
+ "step": 732
1201
+ },
1202
+ {
1203
+ "epoch": 245.29,
1204
+ "learning_rate": 0.0001,
1205
+ "loss": 0.0111,
1206
+ "step": 736
1207
+ },
1208
+ {
1209
+ "epoch": 246.59,
1210
+ "learning_rate": 0.0001,
1211
+ "loss": 0.0112,
1212
+ "step": 740
1213
+ },
1214
+ {
1215
+ "epoch": 247.88,
1216
+ "learning_rate": 0.0001,
1217
+ "loss": 0.0026,
1218
+ "step": 744
1219
+ },
1220
+ {
1221
+ "epoch": 249.29,
1222
+ "learning_rate": 0.0001,
1223
+ "loss": 0.0006,
1224
+ "step": 748
1225
+ },
1226
+ {
1227
+ "epoch": 250.59,
1228
+ "learning_rate": 0.0001,
1229
+ "loss": 0.0004,
1230
+ "step": 752
1231
+ },
1232
+ {
1233
+ "epoch": 251.88,
1234
+ "learning_rate": 0.0001,
1235
+ "loss": 0.0004,
1236
+ "step": 756
1237
+ },
1238
+ {
1239
+ "epoch": 253.29,
1240
+ "learning_rate": 0.0001,
1241
+ "loss": 0.0004,
1242
+ "step": 760
1243
+ },
1244
+ {
1245
+ "epoch": 254.59,
1246
+ "learning_rate": 0.0001,
1247
+ "loss": 0.0004,
1248
+ "step": 764
1249
+ },
1250
+ {
1251
+ "epoch": 255.88,
1252
+ "learning_rate": 0.0001,
1253
+ "loss": 0.0003,
1254
+ "step": 768
1255
+ },
1256
+ {
1257
+ "epoch": 255.88,
1258
+ "eval_exact_match": 0.6982591876208898,
1259
+ "eval_loss": 0.3340039849281311,
1260
+ "eval_runtime": 1918.3137,
1261
+ "eval_samples_per_second": 0.539,
1262
+ "step": 768
1263
+ },
1264
+ {
1265
+ "epoch": 257.29,
1266
+ "learning_rate": 0.0001,
1267
+ "loss": 0.0003,
1268
+ "step": 772
1269
+ },
1270
+ {
1271
+ "epoch": 258.59,
1272
+ "learning_rate": 0.0001,
1273
+ "loss": 0.0004,
1274
+ "step": 776
1275
+ },
1276
+ {
1277
+ "epoch": 259.88,
1278
+ "learning_rate": 0.0001,
1279
+ "loss": 0.0005,
1280
+ "step": 780
1281
+ },
1282
+ {
1283
+ "epoch": 261.29,
1284
+ "learning_rate": 0.0001,
1285
+ "loss": 0.0003,
1286
+ "step": 784
1287
+ },
1288
+ {
1289
+ "epoch": 262.59,
1290
+ "learning_rate": 0.0001,
1291
+ "loss": 0.0004,
1292
+ "step": 788
1293
+ },
1294
+ {
1295
+ "epoch": 263.88,
1296
+ "learning_rate": 0.0001,
1297
+ "loss": 0.0003,
1298
+ "step": 792
1299
+ },
1300
+ {
1301
+ "epoch": 265.29,
1302
+ "learning_rate": 0.0001,
1303
+ "loss": 0.0004,
1304
+ "step": 796
1305
+ },
1306
+ {
1307
+ "epoch": 266.59,
1308
+ "learning_rate": 0.0001,
1309
+ "loss": 0.0003,
1310
+ "step": 800
1311
+ },
1312
+ {
1313
+ "epoch": 267.88,
1314
+ "learning_rate": 0.0001,
1315
+ "loss": 0.0003,
1316
+ "step": 804
1317
+ },
1318
+ {
1319
+ "epoch": 269.29,
1320
+ "learning_rate": 0.0001,
1321
+ "loss": 0.0002,
1322
+ "step": 808
1323
+ },
1324
+ {
1325
+ "epoch": 270.59,
1326
+ "learning_rate": 0.0001,
1327
+ "loss": 0.0003,
1328
+ "step": 812
1329
+ },
1330
+ {
1331
+ "epoch": 271.88,
1332
+ "learning_rate": 0.0001,
1333
+ "loss": 0.0003,
1334
+ "step": 816
1335
+ },
1336
+ {
1337
+ "epoch": 273.29,
1338
+ "learning_rate": 0.0001,
1339
+ "loss": 0.0003,
1340
+ "step": 820
1341
+ },
1342
+ {
1343
+ "epoch": 274.59,
1344
+ "learning_rate": 0.0001,
1345
+ "loss": 0.0002,
1346
+ "step": 824
1347
+ },
1348
+ {
1349
+ "epoch": 275.88,
1350
+ "learning_rate": 0.0001,
1351
+ "loss": 0.0011,
1352
+ "step": 828
1353
+ },
1354
+ {
1355
+ "epoch": 277.29,
1356
+ "learning_rate": 0.0001,
1357
+ "loss": 0.0012,
1358
+ "step": 832
1359
+ },
1360
+ {
1361
+ "epoch": 277.29,
1362
+ "eval_exact_match": 0.6924564796905223,
1363
+ "eval_loss": 0.3206212818622589,
1364
+ "eval_runtime": 1930.7912,
1365
+ "eval_samples_per_second": 0.536,
1366
+ "step": 832
1367
+ },
1368
+ {
1369
+ "epoch": 278.59,
1370
+ "learning_rate": 0.0001,
1371
+ "loss": 0.0003,
1372
+ "step": 836
1373
+ },
1374
+ {
1375
+ "epoch": 279.88,
1376
+ "learning_rate": 0.0001,
1377
+ "loss": 0.0005,
1378
+ "step": 840
1379
+ },
1380
+ {
1381
+ "epoch": 281.29,
1382
+ "learning_rate": 0.0001,
1383
+ "loss": 0.0002,
1384
+ "step": 844
1385
+ },
1386
+ {
1387
+ "epoch": 282.59,
1388
+ "learning_rate": 0.0001,
1389
+ "loss": 0.0013,
1390
+ "step": 848
1391
+ },
1392
+ {
1393
+ "epoch": 283.88,
1394
+ "learning_rate": 0.0001,
1395
+ "loss": 0.0004,
1396
+ "step": 852
1397
+ },
1398
+ {
1399
+ "epoch": 285.29,
1400
+ "learning_rate": 0.0001,
1401
+ "loss": 0.0004,
1402
+ "step": 856
1403
+ },
1404
+ {
1405
+ "epoch": 286.59,
1406
+ "learning_rate": 0.0001,
1407
+ "loss": 0.0007,
1408
+ "step": 860
1409
+ },
1410
+ {
1411
+ "epoch": 287.88,
1412
+ "learning_rate": 0.0001,
1413
+ "loss": 0.0058,
1414
+ "step": 864
1415
+ },
1416
+ {
1417
+ "epoch": 289.29,
1418
+ "learning_rate": 0.0001,
1419
+ "loss": 0.0153,
1420
+ "step": 868
1421
+ },
1422
+ {
1423
+ "epoch": 290.59,
1424
+ "learning_rate": 0.0001,
1425
+ "loss": 0.004,
1426
+ "step": 872
1427
+ },
1428
+ {
1429
+ "epoch": 291.88,
1430
+ "learning_rate": 0.0001,
1431
+ "loss": 0.0004,
1432
+ "step": 876
1433
+ },
1434
+ {
1435
+ "epoch": 293.29,
1436
+ "learning_rate": 0.0001,
1437
+ "loss": 0.0003,
1438
+ "step": 880
1439
+ },
1440
+ {
1441
+ "epoch": 294.59,
1442
+ "learning_rate": 0.0001,
1443
+ "loss": 0.0002,
1444
+ "step": 884
1445
+ },
1446
+ {
1447
+ "epoch": 295.88,
1448
+ "learning_rate": 0.0001,
1449
+ "loss": 0.0005,
1450
+ "step": 888
1451
+ },
1452
+ {
1453
+ "epoch": 297.29,
1454
+ "learning_rate": 0.0001,
1455
+ "loss": 0.0004,
1456
+ "step": 892
1457
+ },
1458
+ {
1459
+ "epoch": 298.59,
1460
+ "learning_rate": 0.0001,
1461
+ "loss": 0.0003,
1462
+ "step": 896
1463
+ },
1464
+ {
1465
+ "epoch": 298.59,
1466
+ "eval_exact_match": 0.7030947775628626,
1467
+ "eval_loss": 0.3318493366241455,
1468
+ "eval_runtime": 1959.3529,
1469
+ "eval_samples_per_second": 0.528,
1470
+ "step": 896
1471
+ },
1472
+ {
1473
+ "epoch": 299.88,
1474
+ "learning_rate": 0.0001,
1475
+ "loss": 0.0002,
1476
+ "step": 900
1477
+ },
1478
+ {
1479
+ "epoch": 301.29,
1480
+ "learning_rate": 0.0001,
1481
+ "loss": 0.0002,
1482
+ "step": 904
1483
+ },
1484
+ {
1485
+ "epoch": 302.59,
1486
+ "learning_rate": 0.0001,
1487
+ "loss": 0.0002,
1488
+ "step": 908
1489
+ },
1490
+ {
1491
+ "epoch": 303.88,
1492
+ "learning_rate": 0.0001,
1493
+ "loss": 0.0002,
1494
+ "step": 912
1495
+ },
1496
+ {
1497
+ "epoch": 305.29,
1498
+ "learning_rate": 0.0001,
1499
+ "loss": 0.0003,
1500
+ "step": 916
1501
+ },
1502
+ {
1503
+ "epoch": 306.59,
1504
+ "learning_rate": 0.0001,
1505
+ "loss": 0.0002,
1506
+ "step": 920
1507
+ },
1508
+ {
1509
+ "epoch": 307.88,
1510
+ "learning_rate": 0.0001,
1511
+ "loss": 0.0002,
1512
+ "step": 924
1513
+ },
1514
+ {
1515
+ "epoch": 309.29,
1516
+ "learning_rate": 0.0001,
1517
+ "loss": 0.0002,
1518
+ "step": 928
1519
+ },
1520
+ {
1521
+ "epoch": 310.59,
1522
+ "learning_rate": 0.0001,
1523
+ "loss": 0.0002,
1524
+ "step": 932
1525
+ },
1526
+ {
1527
+ "epoch": 311.88,
1528
+ "learning_rate": 0.0001,
1529
+ "loss": 0.0004,
1530
+ "step": 936
1531
+ },
1532
+ {
1533
+ "epoch": 313.29,
1534
+ "learning_rate": 0.0001,
1535
+ "loss": 0.0002,
1536
+ "step": 940
1537
+ },
1538
+ {
1539
+ "epoch": 314.59,
1540
+ "learning_rate": 0.0001,
1541
+ "loss": 0.0004,
1542
+ "step": 944
1543
+ },
1544
+ {
1545
+ "epoch": 315.88,
1546
+ "learning_rate": 0.0001,
1547
+ "loss": 0.0005,
1548
+ "step": 948
1549
+ },
1550
+ {
1551
+ "epoch": 317.29,
1552
+ "learning_rate": 0.0001,
1553
+ "loss": 0.0002,
1554
+ "step": 952
1555
+ },
1556
+ {
1557
+ "epoch": 318.59,
1558
+ "learning_rate": 0.0001,
1559
+ "loss": 0.0003,
1560
+ "step": 956
1561
+ },
1562
+ {
1563
+ "epoch": 319.88,
1564
+ "learning_rate": 0.0001,
1565
+ "loss": 0.0002,
1566
+ "step": 960
1567
+ },
1568
+ {
1569
+ "epoch": 319.88,
1570
+ "eval_exact_match": 0.6972920696324951,
1571
+ "eval_loss": 0.34125566482543945,
1572
+ "eval_runtime": 1929.7453,
1573
+ "eval_samples_per_second": 0.536,
1574
+ "step": 960
1575
+ },
1576
+ {
1577
+ "epoch": 321.29,
1578
+ "learning_rate": 0.0001,
1579
+ "loss": 0.0002,
1580
+ "step": 964
1581
+ },
1582
+ {
1583
+ "epoch": 322.59,
1584
+ "learning_rate": 0.0001,
1585
+ "loss": 0.0002,
1586
+ "step": 968
1587
+ },
1588
+ {
1589
+ "epoch": 323.88,
1590
+ "learning_rate": 0.0001,
1591
+ "loss": 0.0002,
1592
+ "step": 972
1593
+ },
1594
+ {
1595
+ "epoch": 325.29,
1596
+ "learning_rate": 0.0001,
1597
+ "loss": 0.0001,
1598
+ "step": 976
1599
+ },
1600
+ {
1601
+ "epoch": 326.59,
1602
+ "learning_rate": 0.0001,
1603
+ "loss": 0.0002,
1604
+ "step": 980
1605
+ },
1606
+ {
1607
+ "epoch": 327.88,
1608
+ "learning_rate": 0.0001,
1609
+ "loss": 0.0002,
1610
+ "step": 984
1611
+ },
1612
+ {
1613
+ "epoch": 329.29,
1614
+ "learning_rate": 0.0001,
1615
+ "loss": 0.0002,
1616
+ "step": 988
1617
+ },
1618
+ {
1619
+ "epoch": 330.59,
1620
+ "learning_rate": 0.0001,
1621
+ "loss": 0.0002,
1622
+ "step": 992
1623
+ },
1624
+ {
1625
+ "epoch": 331.88,
1626
+ "learning_rate": 0.0001,
1627
+ "loss": 0.0006,
1628
+ "step": 996
1629
+ },
1630
+ {
1631
+ "epoch": 333.29,
1632
+ "learning_rate": 0.0001,
1633
+ "loss": 0.015,
1634
+ "step": 1000
1635
+ },
1636
+ {
1637
+ "epoch": 334.59,
1638
+ "learning_rate": 0.0001,
1639
+ "loss": 0.0006,
1640
+ "step": 1004
1641
+ },
1642
+ {
1643
+ "epoch": 335.88,
1644
+ "learning_rate": 0.0001,
1645
+ "loss": 0.0003,
1646
+ "step": 1008
1647
+ },
1648
+ {
1649
+ "epoch": 337.29,
1650
+ "learning_rate": 0.0001,
1651
+ "loss": 0.0003,
1652
+ "step": 1012
1653
+ },
1654
+ {
1655
+ "epoch": 338.59,
1656
+ "learning_rate": 0.0001,
1657
+ "loss": 0.0002,
1658
+ "step": 1016
1659
+ },
1660
+ {
1661
+ "epoch": 339.88,
1662
+ "learning_rate": 0.0001,
1663
+ "loss": 0.0002,
1664
+ "step": 1020
1665
+ },
1666
+ {
1667
+ "epoch": 341.29,
1668
+ "learning_rate": 0.0001,
1669
+ "loss": 0.0002,
1670
+ "step": 1024
1671
+ },
1672
+ {
1673
+ "epoch": 341.29,
1674
+ "eval_exact_match": 0.6963249516441006,
1675
+ "eval_loss": 0.3414294123649597,
1676
+ "eval_runtime": 1925.945,
1677
+ "eval_samples_per_second": 0.537,
1678
+ "step": 1024
1679
+ },
1680
+ {
1681
+ "epoch": 342.59,
1682
+ "learning_rate": 0.0001,
1683
+ "loss": 0.0001,
1684
+ "step": 1028
1685
+ },
1686
+ {
1687
+ "epoch": 343.88,
1688
+ "learning_rate": 0.0001,
1689
+ "loss": 0.0002,
1690
+ "step": 1032
1691
+ },
1692
+ {
1693
+ "epoch": 345.29,
1694
+ "learning_rate": 0.0001,
1695
+ "loss": 0.0002,
1696
+ "step": 1036
1697
+ },
1698
+ {
1699
+ "epoch": 346.59,
1700
+ "learning_rate": 0.0001,
1701
+ "loss": 0.0001,
1702
+ "step": 1040
1703
+ },
1704
+ {
1705
+ "epoch": 347.88,
1706
+ "learning_rate": 0.0001,
1707
+ "loss": 0.0002,
1708
+ "step": 1044
1709
+ },
1710
+ {
1711
+ "epoch": 349.29,
1712
+ "learning_rate": 0.0001,
1713
+ "loss": 0.0001,
1714
+ "step": 1048
1715
+ },
1716
+ {
1717
+ "epoch": 350.59,
1718
+ "learning_rate": 0.0001,
1719
+ "loss": 0.0001,
1720
+ "step": 1052
1721
+ },
1722
+ {
1723
+ "epoch": 351.88,
1724
+ "learning_rate": 0.0001,
1725
+ "loss": 0.0001,
1726
+ "step": 1056
1727
+ },
1728
+ {
1729
+ "epoch": 353.29,
1730
+ "learning_rate": 0.0001,
1731
+ "loss": 0.0001,
1732
+ "step": 1060
1733
+ },
1734
+ {
1735
+ "epoch": 354.59,
1736
+ "learning_rate": 0.0001,
1737
+ "loss": 0.0001,
1738
+ "step": 1064
1739
+ },
1740
+ {
1741
+ "epoch": 355.88,
1742
+ "learning_rate": 0.0001,
1743
+ "loss": 0.0002,
1744
+ "step": 1068
1745
+ },
1746
+ {
1747
+ "epoch": 357.29,
1748
+ "learning_rate": 0.0001,
1749
+ "loss": 0.0002,
1750
+ "step": 1072
1751
+ },
1752
+ {
1753
+ "epoch": 358.59,
1754
+ "learning_rate": 0.0001,
1755
+ "loss": 0.0001,
1756
+ "step": 1076
1757
+ },
1758
+ {
1759
+ "epoch": 359.88,
1760
+ "learning_rate": 0.0001,
1761
+ "loss": 0.0001,
1762
+ "step": 1080
1763
+ },
1764
+ {
1765
+ "epoch": 361.29,
1766
+ "learning_rate": 0.0001,
1767
+ "loss": 0.0001,
1768
+ "step": 1084
1769
+ },
1770
+ {
1771
+ "epoch": 362.59,
1772
+ "learning_rate": 0.0001,
1773
+ "loss": 0.0001,
1774
+ "step": 1088
1775
+ },
1776
+ {
1777
+ "epoch": 362.59,
1778
+ "eval_exact_match": 0.7127659574468085,
1779
+ "eval_loss": 0.3638148605823517,
1780
+ "eval_runtime": 1954.8096,
1781
+ "eval_samples_per_second": 0.529,
1782
+ "step": 1088
1783
+ }
1784
+ ],
1785
+ "max_steps": 9216,
1786
+ "num_train_epochs": 3072,
1787
+ "total_flos": 6.244604651315036e+17,
1788
+ "trial_name": null,
1789
+ "trial_params": null
1790
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:148e84efa75b7224b12affd1f56279d07da53ee037cb008d67b67a7e7c81ca49
3
+ size 2607