potamides committed
Commit 42eef04
Parent: 0de0ec9

add model files

config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "_name_or_path": "google/byt5-large",
+   "architectures": [
+     "ByGPT5LMHeadModel"
+   ],
+   "d_ff": 3840,
+   "d_kv": 64,
+   "d_model": 1536,
+   "decoder_start_token_id": 0,
+   "dense_act_fn": "gelu_new",
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "gated-gelu",
+   "gradient_checkpointing": false,
+   "initializer_factor": 1.0,
+   "is_decoder": true,
+   "is_encoder_decoder": false,
+   "is_gated_act": true,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "bygpt5",
+   "num_decoder_layers": 12,
+   "num_heads": 16,
+   "num_layers": 12,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "tie_word_embeddings": false,
+   "tokenizer_class": "ByT5Tokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.20.0",
+   "use_cache": true,
+   "vocab_size": 384
+ }
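
For reference, a minimal loading sketch. Assumptions (not taken from this commit): the custom ByGPT5 classes ship with the authors' uniformers package rather than stock transformers, and REPO_ID is a hypothetical placeholder for this model's Hugging Face repo id.

    # Hedged sketch: the import path assumes the authors' `uniformers` package;
    # REPO_ID is a hypothetical placeholder, not taken from this commit.
    from uniformers.models.bygpt5 import ByGPT5LMHeadModel, ByGPT5Tokenizer

    REPO_ID = "org/bygpt5-model"  # placeholder

    tokenizer = ByGPT5Tokenizer.from_pretrained(REPO_ID)
    model = ByGPT5LMHeadModel.from_pretrained(REPO_ID)

    # Byte-level decoder-only LM: any UTF-8 string can be fed in directly.
    inputs = tokenizer("Hello", return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))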
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89bb3c3658f646c0da03e8e9c033ef2cfe55311ad448fbc59dc1fd8aaa94f888
+ size 1156247841
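
The committed file is only a Git LFS pointer; the roughly 1.16 GB weight file itself lives on the LFS storage backend. A download sketch via huggingface_hub, with the repo id as a hypothetical placeholder:

    from huggingface_hub import hf_hub_download

    # Resolves the LFS pointer and returns the local cache path of the real file.
    # "org/bygpt5-model" is a placeholder repo id, not taken from this commit.
    path = hf_hub_download(repo_id="org/bygpt5-model", filename="pytorch_model.bin")
    print(path)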
special_tokens_map.json ADDED
@@ -0,0 +1,150 @@
+ {
+   "additional_special_tokens": [
+     "<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>",
+     "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>",
+     "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>",
+     "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>",
+     "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>",
+     "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>",
+     "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>",
+     "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>",
+     "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>",
+     "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>",
+     "<extra_id_100>", "<extra_id_101>", "<extra_id_102>", "<extra_id_103>", "<extra_id_104>", "<extra_id_105>", "<extra_id_106>", "<extra_id_107>", "<extra_id_108>", "<extra_id_109>",
+     "<extra_id_110>", "<extra_id_111>", "<extra_id_112>", "<extra_id_113>", "<extra_id_114>", "<extra_id_115>", "<extra_id_116>", "<extra_id_117>", "<extra_id_118>", "<extra_id_119>",
+     "<extra_id_120>", "<extra_id_121>", "<extra_id_122>", "<extra_id_123>", "<extra_id_124>"
+   ],
+   "eos_token": {"content": "</s>", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false},
+   "pad_token": {"content": "<pad>", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false},
+   "unk_token": {"content": "<unk>", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false}
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,160 @@
+ {
+   "add_bos_token": false,
+   "add_eos_token": false,
+   "add_prefix_space": false,
+   "additional_special_tokens": [
+     "<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>",
+     "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>",
+     "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>",
+     "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>",
+     "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>",
+     "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>",
+     "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>",
+     "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>",
+     "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>",
+     "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>",
+     "<extra_id_100>", "<extra_id_101>", "<extra_id_102>", "<extra_id_103>", "<extra_id_104>", "<extra_id_105>", "<extra_id_106>", "<extra_id_107>", "<extra_id_108>", "<extra_id_109>",
+     "<extra_id_110>", "<extra_id_111>", "<extra_id_112>", "<extra_id_113>", "<extra_id_114>", "<extra_id_115>", "<extra_id_116>", "<extra_id_117>", "<extra_id_118>", "<extra_id_119>",
+     "<extra_id_120>", "<extra_id_121>", "<extra_id_122>", "<extra_id_123>", "<extra_id_124>"
+   ],
+   "eos_token": {"__type": "AddedToken", "content": "</s>", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false},
+   "extra_ids": 125,
+   "name_or_path": "google/byt5-large",
+   "pad_token": {"__type": "AddedToken", "content": "<pad>", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false},
+   "special_tokens_map_file": "/storage/ukp/work/jbelouadi/.cache/huggingface/transformers/1d7794e7e6dcb9aff7f1b0f72b85d665041ce4cfc7c59cf78a0e83302f9ab8e3.063895353d5ef9e19a25220cb616c43abc5e84a2f11b1ffb71c29e097572a109",
+   "tokenizer_class": "ByGPT5Tokenizer",
+   "unk_token": {"__type": "AddedToken", "content": "<unk>", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false}
+ }
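
The vocabulary follows the ByT5 layout: ids 0-2 are <pad>, </s>, and <unk>, ids 3-258 are the 256 raw byte values, and the remaining 125 ids are the <extra_id_*> sentinels (3 + 256 + 125 = 384, matching vocab_size in config.json). A quick illustration with the stock ByT5 tokenizer from transformers; the committed ByGPT5Tokenizer is configured with add_bos_token and add_eos_token set to false, so it would omit the trailing </s>:

    from transformers import ByT5Tokenizer

    # Illustration only: uses the upstream google/byt5-large tokenizer, not the
    # ByGPT5Tokenizer class referenced by this repo.
    tok = ByT5Tokenizer.from_pretrained("google/byt5-large")
    print(tok("abc").input_ids)  # UTF-8 bytes 97, 98, 99 offset by 3 -> [100, 101, 102, 1]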
trainer_state.json ADDED
@@ -0,0 +1,1391 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.9999812503515559,
+   "global_step": 50000,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {"epoch": 0.0, "learning_rate": 6e-07, "loss": 8.3446, "step": 1},
+     {"epoch": 0.0, "learning_rate": 0.00015, "loss": 2.7695, "step": 250},
+     {"epoch": 0.01, "learning_rate": 0.0003, "loss": 1.1361, "step": 500},
+     {"epoch": 0.01, "learning_rate": 0.00029998111915108125, "loss": 1.0271, "step": 750},
+     {"epoch": 0.02, "learning_rate": 0.00029992448135747777, "loss": 0.9812, "step": 1000},
+     {"epoch": 0.02, "learning_rate": 0.0002998301008774512, "loss": 0.9532, "step": 1250},
+     {"epoch": 0.03, "learning_rate": 0.00029969800147078263, "loss": 0.9337, "step": 1500},
+     {"epoch": 0.03, "learning_rate": 0.00029952821639279135, "loss": 0.9186, "step": 1750},
+     {"epoch": 0.04, "learning_rate": 0.0002993207883859627, "loss": 0.9058, "step": 2000},
+     {"epoch": 0.04, "learning_rate": 0.0002990757696691881, "loss": 0.896, "step": 2250},
+     {"epoch": 0.05, "learning_rate": 0.00029879322192461925, "loss": 0.8873, "step": 2500},
+     {"epoch": 0.05, "eval_loss": 0.8292228579521179, "eval_runtime": 297.3658, "eval_samples_per_second": 344.357, "eval_steps_per_second": 5.381, "step": 2500},
+     {"epoch": 0.05, "learning_rate": 0.0002984732162821399, "loss": 0.8794, "step": 2750},
+     {"epoch": 0.06, "learning_rate": 0.00029811583330145914, "loss": 0.871, "step": 3000},
+     {"epoch": 0.06, "learning_rate": 0.0002977211629518312, "loss": 0.8648, "step": 3250},
+     {"epoch": 0.07, "learning_rate": 0.00029728930458940595, "loss": 0.8586, "step": 3500},
+     {"epoch": 0.07, "learning_rate": 0.0002968203669322168, "loss": 0.8536, "step": 3750},
+     {"epoch": 0.08, "learning_rate": 0.00029631446803281107, "loss": 0.8496, "step": 4000},
+     {"epoch": 0.08, "learning_rate": 0.00029577173524853123, "loss": 0.8447, "step": 4250},
+     {"epoch": 0.09, "learning_rate": 0.0002951923052094534, "loss": 0.8396, "step": 4500},
+     {"epoch": 0.09, "learning_rate": 0.00029457632378399127, "loss": 0.8363, "step": 4750},
+     {"epoch": 0.1, "learning_rate": 0.0002939239460421746, "loss": 0.8322, "step": 5000},
+     {"epoch": 0.1, "eval_loss": 0.7891861200332642, "eval_runtime": 298.0718, "eval_samples_per_second": 343.541, "eval_steps_per_second": 5.368, "step": 5000},
+     {"epoch": 0.1, "learning_rate": 0.00029323533621661106, "loss": 0.8288, "step": 5250},
+     {"epoch": 0.11, "learning_rate": 0.00029251066766114176, "loss": 0.8251, "step": 5500},
+     {"epoch": 0.11, "learning_rate": 0.00029175012280720024, "loss": 0.823, "step": 5750},
+     {"epoch": 0.12, "learning_rate": 0.0002909538931178862, "loss": 0.8194, "step": 6000},
+     {"epoch": 0.12, "learning_rate": 0.000290122179039766, "loss": 0.8169, "step": 6250},
+     {"epoch": 0.13, "learning_rate": 0.0002892551899524109, "loss": 0.8139, "step": 6500},
+     {"epoch": 0.13, "learning_rate": 0.0002883531441156872, "loss": 0.8108, "step": 6750},
+     {"epoch": 0.14, "learning_rate": 0.0002874162686148104, "loss": 0.8089, "step": 7000},
+     {"epoch": 0.14, "learning_rate": 0.00028644479930317775, "loss": 0.8071, "step": 7250},
+     {"epoch": 0.15, "learning_rate": 0.00028543898074299317, "loss": 0.8038, "step": 7500},
+     {"epoch": 0.15, "eval_loss": 0.7673315405845642, "eval_runtime": 298.1988, "eval_samples_per_second": 343.395, "eval_steps_per_second": 5.366, "step": 7500},
+     {"epoch": 0.15, "learning_rate": 0.00028439906614370034, "loss": 0.8022, "step": 7750},
+     {"epoch": 0.16, "learning_rate": 0.0002833253172982385, "loss": 0.8003, "step": 8000},
+     {"epoch": 0.16, "learning_rate": 0.0002822180045171373, "loss": 0.7981, "step": 8250},
+     {"epoch": 0.17, "learning_rate": 0.0002810774065604677, "loss": 0.7965, "step": 8500},
+     {"epoch": 0.17, "learning_rate": 0.0002799038105676658, "loss": 0.7942, "step": 8750},
+     {"epoch": 0.18, "learning_rate": 0.0002786975119852465, "loss": 0.7923, "step": 9000},
+     {"epoch": 0.18, "learning_rate": 0.00027745881449242713, "loss": 0.791, "step": 9250},
+     {"epoch": 0.19, "learning_rate": 0.0002761880299246772, "loss": 0.7898, "step": 9500},
+     {"epoch": 0.19, "learning_rate": 0.0002748854781952157, "loss": 0.7883, "step": 9750},
+     {"epoch": 0.2, "learning_rate": 0.0002735514872144749, "loss": 0.7867, "step": 10000},
+     {"epoch": 0.2, "eval_loss": 0.7522478699684143, "eval_runtime": 298.1155, "eval_samples_per_second": 343.491, "eval_steps_per_second": 5.367, "step": 10000},
+     {"epoch": 0.2, "learning_rate": 0.0002721863928075503, "loss": 0.7849, "step": 10250},
+     {"epoch": 0.21, "learning_rate": 0.00027079053862965875, "loss": 0.7837, "step": 10500},
+     {"epoch": 0.21, "learning_rate": 0.0002693642760796248, "loss": 0.7824, "step": 10750},
+     {"epoch": 0.22, "learning_rate": 0.00026790796421141813, "loss": 0.781, "step": 11000},
+     {"epoch": 0.22, "learning_rate": 0.0002664219696437635, "loss": 0.7793, "step": 11250},
+     {"epoch": 0.23, "learning_rate": 0.00026490666646784665, "loss": 0.7776, "step": 11500},
+     {"epoch": 0.23, "learning_rate": 0.00026336243615313873, "loss": 0.7768, "step": 11750},
+     {"epoch": 0.24, "learning_rate": 0.0002617896674513632, "loss": 0.7747, "step": 12000},
+     {"epoch": 0.24, "learning_rate": 0.00026018875629862996, "loss": 0.7743, "step": 12250},
+     {"epoch": 0.25, "learning_rate": 0.0002585601057157605, "loss": 0.7731, "step": 12500},
+     {"epoch": 0.25, "eval_loss": 0.7414656281471252, "eval_runtime": 297.2255, "eval_samples_per_second": 344.52, "eval_steps_per_second": 5.383, "step": 12500},
+     {"epoch": 0.25, "learning_rate": 0.00025690412570682946, "loss": 0.7716, "step": 12750},
+     {"epoch": 0.26, "learning_rate": 0.0002552212331559482, "loss": 0.7709, "step": 13000},
+     {"epoch": 0.26, "learning_rate": 0.0002535118517223168, "loss": 0.7699, "step": 13250},
+     {"epoch": 0.27, "learning_rate": 0.0002517764117335698, "loss": 0.7692, "step": 13500},
+     {"epoch": 0.27, "learning_rate": 0.00025001535007744373, "loss": 0.7681, "step": 13750},
+     {"epoch": 0.28, "learning_rate": 0.00024822911009179276, "loss": 0.7666, "step": 14000},
+     {"epoch": 0.28, "learning_rate": 0.0002464181414529809, "loss": 0.7658, "step": 14250},
+     {"epoch": 0.29, "learning_rate": 0.00024458290006267833, "loss": 0.7644, "step": 14500},
+     {"epoch": 0.29, "learning_rate": 0.00024272384793309077, "loss": 0.7632, "step": 14750},
+     {"epoch": 0.3, "learning_rate": 0.00024084145307064997, "loss": 0.7634, "step": 15000},
+     {"epoch": 0.3, "eval_loss": 0.7321411967277527, "eval_runtime": 583.7459, "eval_samples_per_second": 175.419, "eval_steps_per_second": 5.482, "step": 15000},
+     {"epoch": 0.3, "learning_rate": 0.00023893618935819607, "loss": 0.7624, "step": 15250},
+     {"epoch": 0.31, "learning_rate": 0.0002370085364356797, "loss": 0.7609, "step": 15500},
+     {"epoch": 0.31, "learning_rate": 0.00023505897957941556, "loss": 0.7609, "step": 15750},
+     {"epoch": 0.32, "learning_rate": 0.00023308800957991653, "loss": 0.7593, "step": 16000},
+     {"epoch": 0.32, "learning_rate": 0.00023109612261833963, "loss": 0.7592, "step": 16250},
+     {"epoch": 0.33, "learning_rate": 0.00022908382014157533, "loss": 0.7586, "step": 16500},
+     {"epoch": 0.33, "learning_rate": 0.00022705160873601096, "loss": 0.7574, "step": 16750},
+     {"epoch": 0.34, "learning_rate": 0.000225, "loss": 0.7564, "step": 17000},
+     {"epoch": 0.34, "learning_rate": 0.00022292951041507028, "loss": 0.7557, "step": 17250},
+     {"epoch": 0.35, "learning_rate": 0.0002208406612159024, "loss": 0.7546, "step": 17500},
+     {"epoch": 0.35, "eval_loss": 0.7252368927001953, "eval_runtime": 582.4208, "eval_samples_per_second": 175.818, "eval_steps_per_second": 5.494, "step": 17500},
+     {"epoch": 0.35, "learning_rate": 0.00021873397825911153, "loss": 0.7545, "step": 17750},
+     {"epoch": 0.36, "learning_rate": 0.0002166099918908661, "loss": 0.7535, "step": 18000},
+     {"epoch": 0.36, "learning_rate": 0.00021446923681337575, "loss": 0.7531, "step": 18250},
+     {"epoch": 0.37, "learning_rate": 0.00021231225195028297, "loss": 0.7526, "step": 18500},
+     {"epoch": 0.37, "learning_rate": 0.00021013958031099205, "loss": 0.7514, "step": 18750},
+     {"epoch": 0.38, "learning_rate": 0.00020795176885396926, "loss": 0.7512, "step": 19000},
+     {"epoch": 0.38, "learning_rate": 0.0002057493683490491, "loss": 0.7505, "step": 19250},
+     {"epoch": 0.39, "learning_rate": 0.00020353293323878074, "loss": 0.7504, "step": 19500},
+     {"epoch": 0.39, "learning_rate": 0.00020130302149885031, "loss": 0.7488, "step": 19750},
+     {"epoch": 0.4, "learning_rate": 0.00019906019449761325, "loss": 0.7484, "step": 20000},
+     {"epoch": 0.4, "eval_loss": 0.7194287776947021, "eval_runtime": 581.861, "eval_samples_per_second": 175.987, "eval_steps_per_second": 5.5, "step": 20000},
+     {"epoch": 0.4, "learning_rate": 0.00019680501685477304, "loss": 0.7477, "step": 20250},
+     {"epoch": 0.41, "learning_rate": 0.00019453805629924124, "loss": 0.7467, "step": 20500},
+     {"epoch": 0.41, "learning_rate": 0.00019225988352621445, "loss": 0.7467, "step": 20750},
+     {"epoch": 0.42, "learning_rate": 0.0001899710720535052, "loss": 0.7456, "step": 21000},
+     {"epoch": 0.42, "learning_rate": 0.00018767219807716185, "loss": 0.746, "step": 21250},
+     {"epoch": 0.43, "learning_rate": 0.0001853638403264141, "loss": 0.7452, "step": 21500},
+     {"epoch": 0.43, "learning_rate": 0.0001830465799179811, "loss": 0.7444, "step": 21750},
+     {"epoch": 0.44, "learning_rate": 0.0001807210002097786, "loss": 0.7442, "step": 22000},
+     {"epoch": 0.44, "learning_rate": 0.0001783876866540615, "loss": 0.7439, "step": 22250},
+     {"epoch": 0.45, "learning_rate": 0.00017604722665003956, "loss": 0.7433, "step": 22500},
+     {"epoch": 0.45, "eval_loss": 0.7142770886421204, "eval_runtime": 580.2963, "eval_samples_per_second": 176.462, "eval_steps_per_second": 5.514, "step": 22500},
+     {"epoch": 0.45, "learning_rate": 0.00017370020939600248, "loss": 0.7417, "step": 22750},
+     {"epoch": 0.46, "learning_rate": 0.00017134722574099276, "loss": 0.7417, "step": 23000},
+     {"epoch": 0.46, "learning_rate": 0.00016898886803606237, "loss": 0.741, "step": 23250},
+     {"epoch": 0.47, "learning_rate": 0.00016662572998515164, "loss": 0.7408, "step": 23500},
+     {"epoch": 0.47, "learning_rate": 0.00016425840649562736, "loss": 0.7399, "step": 23750},
+     {"epoch": 0.48, "learning_rate": 0.00016188749352851825, "loss": 0.7389, "step": 24000},
+     {"epoch": 0.48, "learning_rate": 0.00015951358794848465, "loss": 0.7389, "step": 24250},
+     {"epoch": 0.49, "learning_rate": 0.00015713728737356137, "loss": 0.7399, "step": 24500},
+     {"epoch": 0.49, "learning_rate": 0.00015475919002471016, "loss": 0.7386, "step": 24750},
+     {"epoch": 0.5, "learning_rate": 0.00015237989457522118, "loss": 0.7373, "step": 25000},
+     {"epoch": 0.5, "eval_loss": 0.7102295160293579, "eval_runtime": 582.8326, "eval_samples_per_second": 175.694, "eval_steps_per_second": 5.49, "step": 25000},
+     {"epoch": 0.5, "learning_rate": 0.00015, "loss": 0.7379, "step": 25250},
+     {"epoch": 0.51, "learning_rate": 0.0001476201054247788, "loss": 0.7371, "step": 25500},
+     {"epoch": 0.51, "learning_rate": 0.00014524080997528987, "loss": 0.7374, "step": 25750},
+     {"epoch": 0.52, "learning_rate": 0.00014286271262643866, "loss": 0.7367, "step": 26000},
+     {"epoch": 0.52, "learning_rate": 0.00014048641205151533, "loss": 0.7365, "step": 26250},
+     {"epoch": 0.53, "learning_rate": 0.0001381125064714817, "loss": 0.7351, "step": 26500},
+     {"epoch": 0.53, "learning_rate": 0.00013574159350437261, "loss": 0.7351, "step": 26750},
+     {"epoch": 0.54, "learning_rate": 0.00013337427001484836, "loss": 0.7355, "step": 27000},
+     {"epoch": 0.54, "learning_rate": 0.00013101113196393758, "loss": 0.7347, "step": 27250},
+     {"epoch": 0.55, "learning_rate": 0.00012865277425900724, "loss": 0.7341, "step": 27500},
+     {"epoch": 0.55, "eval_loss": 0.7066481113433838, "eval_runtime": 583.6904, "eval_samples_per_second": 175.435, "eval_steps_per_second": 5.482, "step": 27500},
+     {"epoch": 0.55, "learning_rate": 0.0001262997906039975, "loss": 0.7346, "step": 27750},
+     {"epoch": 0.56, "learning_rate": 0.00012395277334996044, "loss": 0.7329, "step": 28000},
+     {"epoch": 0.56, "learning_rate": 0.00012161231334593851, "loss": 0.733, "step": 28250},
+     {"epoch": 0.57, "learning_rate": 0.0001192789997902214, "loss": 0.7326, "step": 28500},
+     {"epoch": 0.57, "learning_rate": 0.00011695342008201888, "loss": 0.7322, "step": 28750},
+     {"epoch": 0.58, "learning_rate": 0.00011463615967358588, "loss": 0.7326, "step": 29000},
+     {"epoch": 0.58, "learning_rate": 0.00011232780192283812, "loss": 0.732, "step": 29250},
+     {"epoch": 0.59, "learning_rate": 0.00011002892794649476, "loss": 0.7311, "step": 29500},
+     {"epoch": 0.59, "learning_rate": 0.00010774011647378553, "loss": 0.7312, "step": 29750},
+     {"epoch": 0.6, "learning_rate": 0.00010546194370075881, "loss": 0.7304, "step": 30000},
+     {"epoch": 0.6, "eval_loss": 0.7039176225662231, "eval_runtime": 583.6715, "eval_samples_per_second": 175.441, "eval_steps_per_second": 5.483, "step": 30000},
+     {"epoch": 0.6, "learning_rate": 0.00010319498314522693, "loss": 0.7305, "step": 30250},
+     {"epoch": 0.61, "learning_rate": 0.00010093980550238675, "loss": 0.7308, "step": 30500},
+     {"epoch": 0.61, "learning_rate": 9.869697850114969e-05, "loss": 0.73, "step": 30750},
+     {"epoch": 0.62, "learning_rate": 9.646706676121923e-05, "loss": 0.7303, "step": 31000},
+     {"epoch": 0.62, "learning_rate": 9.425063165095088e-05, "loss": 0.7303, "step": 31250},
+     {"epoch": 0.63, "learning_rate": 9.204823114603068e-05, "loss": 0.7289, "step": 31500},
+     {"epoch": 0.63, "learning_rate": 8.986041968900796e-05, "loss": 0.7291, "step": 31750},
+     {"epoch": 0.64, "learning_rate": 8.768774804971705e-05, "loss": 0.7289, "step": 32000},
+     {"epoch": 0.64, "learning_rate": 8.553076318662425e-05, "loss": 0.7288, "step": 32250},
+     {"epoch": 0.65, "learning_rate": 8.339000810913386e-05, "loss": 0.7276, "step": 32500},
+     {"epoch": 0.65, "eval_loss": 0.7014814615249634, "eval_runtime": 583.4506, "eval_samples_per_second": 175.508, "eval_steps_per_second": 5.485, "step": 32500},
+     {"epoch": 0.65, "learning_rate": 8.126602174088843e-05, "loss": 0.7281, "step": 32750},
+     {"epoch": 0.66, "learning_rate": 7.915933878409761e-05, "loss": 0.7284, "step": 33000},
+     {"epoch": 0.66, "learning_rate": 7.707048958492972e-05, "loss": 0.728, "step": 33250},
+     {"epoch": 0.67, "learning_rate": 7.500000000000002e-05, "loss": 0.7262, "step": 33500},
+     {"epoch": 0.67, "learning_rate": 7.294839126398908e-05, "loss": 0.7271, "step": 33750},
+     {"epoch": 0.68, "learning_rate": 7.091617985842462e-05, "loss": 0.7268, "step": 34000},
+     {"epoch": 0.68, "learning_rate": 6.890387738166041e-05, "loss": 0.7264, "step": 34250},
+     {"epoch": 0.69, "learning_rate": 6.691199042008345e-05, "loss": 0.7266, "step": 34500},
+     {"epoch": 0.69, "learning_rate": 6.49410204205844e-05, "loss": 0.7264, "step": 34750},
+     {"epoch": 0.7, "learning_rate": 6.299146356432029e-05, "loss": 0.7256, "step": 35000},
+     {"epoch": 0.7, "eval_loss": 0.6993971467018127, "eval_runtime": 583.2225, "eval_samples_per_second": 175.576, "eval_steps_per_second": 5.487, "step": 35000},
+     {"epoch": 0.7, "learning_rate": 6.106381064180395e-05, "loss": 0.726, "step": 35250},
+     {"epoch": 0.71, "learning_rate": 5.915854692935002e-05, "loss": 0.7256, "step": 35500},
+     {"epoch": 0.71, "learning_rate": 5.72761520669092e-05, "loss": 0.7255, "step": 35750},
+     {"epoch": 0.72, "learning_rate": 5.541709993732167e-05, "loss": 0.725, "step": 36000},
+     {"epoch": 0.72, "learning_rate": 5.358185854701909e-05, "loss": 0.7254, "step": 36250},
+     {"epoch": 0.73, "learning_rate": 5.1770889908207245e-05, "loss": 0.7253, "step": 36500},
+     {"epoch": 0.73, "learning_rate": 4.998464992255627e-05, "loss": 0.7249, "step": 36750},
+     {"epoch": 0.74, "learning_rate": 4.8223588266430186e-05, "loss": 0.7239, "step": 37000},
+     {"epoch": 0.74, "learning_rate": 4.648814827768322e-05, "loss": 0.7246, "step": 37250},
+     {"epoch": 0.75, "learning_rate": 4.477876684405179e-05, "loss": 0.7243, "step": 37500},
+     {"epoch": 0.75, "eval_loss": 0.6978650689125061, "eval_runtime": 583.3741, "eval_samples_per_second": 175.531, "eval_steps_per_second": 5.485, "step": 37500},
+     {"epoch": 0.75, "learning_rate": 4.309587429317061e-05, "loss": 0.7242, "step": 37750},
+     {"epoch": 0.76, "learning_rate": 4.143989428423947e-05, "loss": 0.7247, "step": 38000},
+     {"epoch": 0.76, "learning_rate": 3.981124370137001e-05, "loss": 0.7234, "step": 38250},
+     {"epoch": 0.77, "learning_rate": 3.8210332548636796e-05, "loss": 0.7234, "step": 38500},
+     {"epoch": 0.77, "learning_rate": 3.663756384686127e-05, "loss": 0.7238, "step": 38750},
+     {"epoch": 0.78, "learning_rate": 3.509333353215331e-05, "loss": 0.7234, "step": 39000},
+     {"epoch": 0.78, "learning_rate": 3.3578030356236455e-05, "loss": 0.7235, "step": 39250},
+     {"epoch": 0.79, "learning_rate": 3.209203578858191e-05, "loss": 0.7238, "step": 39500},
+     {"epoch": 0.79, "learning_rate": 3.0635723920375164e-05, "loss": 0.7234, "step": 39750},
+     {"epoch": 0.8, "learning_rate": 2.9209461370341204e-05, "loss": 0.7226, "step": 40000},
+     {"epoch": 0.8, "eval_loss": 0.6969777941703796, "eval_runtime": 583.6675, "eval_samples_per_second": 175.442, "eval_steps_per_second": 5.483, "step": 40000},
+     {"epoch": 0.8, "learning_rate": 2.781360719244964e-05, "loss": 0.7233, "step": 40250},
+     {"epoch": 0.81, "learning_rate": 2.6448512785525093e-05, "loss": 0.7227, "step": 40500},
+     {"epoch": 0.81, "learning_rate": 2.5114521804784305e-05, "loss": 0.7231, "step": 40750},
+     {"epoch": 0.82, "learning_rate": 2.38119700753228e-05, "loss": 0.723, "step": 41000},
+     {"epoch": 0.82, "learning_rate": 2.2541185507572858e-05, "loss": 0.7229, "step": 41250},
+     {"epoch": 0.83, "learning_rate": 2.130248801475344e-05, "loss": 0.7221, "step": 41500},
+     {"epoch": 0.83, "learning_rate": 2.009618943233419e-05, "loss": 0.7227, "step": 41750},
+     {"epoch": 0.84, "learning_rate": 1.892259343953226e-05, "loss": 0.7224, "step": 42000},
+     {"epoch": 0.84, "learning_rate": 1.7781995482862705e-05, "loss": 0.7224, "step": 42250},
+     {"epoch": 0.85, "learning_rate": 1.6674682701761493e-05, "loss": 0.7224, "step": 42500},
+     {"epoch": 0.85, "eval_loss": 0.6960312724113464, "eval_runtime": 583.8116, "eval_samples_per_second": 175.399, "eval_steps_per_second": 5.481, "step": 42500},
+     {"epoch": 0.85, "learning_rate": 1.5600933856299635e-05, "loss": 0.7222, "step": 42750},
+     {"epoch": 0.86, "learning_rate": 1.4561019257006839e-05, "loss": 0.7221, "step": 43000},
+     {"epoch": 0.86, "learning_rate": 1.3555200696822232e-05, "loss": 0.7222, "step": 43250},
+     {"epoch": 0.87, "learning_rate": 1.258373138518956e-05, "loss": 0.7217, "step": 43500},
+     {"epoch": 0.87, "learning_rate": 1.164685588431281e-05, "loss": 0.7215, "step": 43750},
+     {"epoch": 0.88, "learning_rate": 1.0744810047589115e-05, "loss": 0.7214, "step": 44000},
+     {"epoch": 0.88, "learning_rate": 9.877820960234002e-06, "loss": 0.7225, "step": 44250},
+     {"epoch": 0.89, "learning_rate": 9.046106882113751e-06, "loss": 0.7226, "step": 44500},
+     {"epoch": 0.89, "learning_rate": 8.249877192799731e-06, "loss": 0.722, "step": 44750},
+     {"epoch": 0.9, "learning_rate": 7.489332338858201e-06, "loss": 0.7214, "step": 45000},
+     {"epoch": 0.9, "eval_loss": 0.6956482529640198, "eval_runtime": 582.8898, "eval_samples_per_second": 175.676, "eval_steps_per_second": 5.49, "step": 45000},
+     {"epoch": 0.9, "learning_rate": 6.764663783388918e-06, "loss": 0.7216, "step": 45250},
+     {"epoch": 0.91, "learning_rate": 6.076053957825411e-06, "loss": 0.7225, "step": 45500},
+     {"epoch": 0.91, "learning_rate": 5.4236762160086935e-06, "loss": 0.7214, "step": 45750},
+     {"epoch": 0.92, "learning_rate": 4.807694790546562e-06, "loss": 0.7211, "step": 46000},
+     {"epoch": 0.92, "learning_rate": 4.228264751468752e-06, "loss": 0.7215, "step": 46250},
+     {"epoch": 0.93, "learning_rate": 3.6855319671889427e-06, "loss": 0.7218, "step": 46500},
+     {"epoch": 0.93, "learning_rate": 3.179633067783205e-06, "loss": 0.7213, "step": 46750},
+     {"epoch": 0.94, "learning_rate": 2.710695410593994e-06, "loss": 0.7211, "step": 47000},
+     {"epoch": 0.94, "learning_rate": 2.2788370481687965e-06, "loss": 0.7208, "step": 47250},
+     {"epoch": 0.95, "learning_rate": 1.8841666985408566e-06, "loss": 0.722, "step": 47500},
+     {"epoch": 0.95, "eval_loss": 0.6955819129943848, "eval_runtime": 583.7524, "eval_samples_per_second": 175.417, "eval_steps_per_second": 5.482, "step": 47500},
+     {"epoch": 0.95, "learning_rate": 1.5267837178600972e-06, "loss": 0.722, "step": 47750},
+     {"epoch": 0.96, "learning_rate": 1.2067780753806988e-06, "loss": 0.7213, "step": 48000},
+     {"epoch": 0.96, "learning_rate": 9.242303308118815e-07, "loss": 0.7215, "step": 48250},
+     {"epoch": 0.97, "learning_rate": 6.792116140373116e-07, "loss": 0.7214, "step": 48500},
+     {"epoch": 0.97, "learning_rate": 4.717836072086589e-07, "loss": 0.7211, "step": 48750},
+     {"epoch": 0.98, "learning_rate": 3.01998529217351e-07, "loss": 0.7205, "step": 49000},
+     {"epoch": 0.98, "learning_rate": 1.6989912254880556e-07, "loss": 0.721, "step": 49250},
+     {"epoch": 0.99, "learning_rate": 7.551864252223761e-08, "loss": 0.7212, "step": 49500},
+     {"epoch": 0.99, "learning_rate": 1.8880848918739756e-08, "loss": 0.7208, "step": 49750},
+     {"epoch": 1.0, "learning_rate": 0.0, "loss": 0.7212, "step": 50000},
+     {"epoch": 1.0, "eval_loss": 0.6954200267791748, "eval_runtime": 582.7829, "eval_samples_per_second": 175.709, "eval_steps_per_second": 5.491, "step": 50000},
+     {"epoch": 1.0, "step": 50000, "total_flos": 4.5371203009812365e+19, "train_loss": 0.014421029052734374, "train_runtime": 10896.4792, "train_samples_per_second": 2349.383, "train_steps_per_second": 4.589}
+   ],
+   "max_steps": 50000,
+   "num_train_epochs": 1,
+   "total_flos": 4.5371203009812365e+19,
+   "trial_name": null,
+   "trial_params": null
+ }
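
A small sketch for inspecting this log once the file is downloaded locally: it loads trainer_state.json and prints the eval_loss recorded at each 2500-step evaluation.

    import json

    # Assumes trainer_state.json has been downloaded to the working directory.
    with open("trainer_state.json") as f:
        state = json.load(f)

    for entry in state["log_history"]:
        if "eval_loss" in entry:
            print(f'step {entry["step"]:>6}: eval_loss {entry["eval_loss"]:.4f}')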
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:641ff185ddda2a4b44faabb6f6a79df6838d52cb9d83b841bcd6fde5ccf90020
+ size 3375