indiejoseph committed on
Commit
b814aaf
1 Parent(s): 1388220

Model save

README.md CHANGED
@@ -1,23 +1,9 @@
1
  ---
2
- base_model: fnlp/bert-base-chinese
3
  tags:
4
  - generated_from_trainer
5
  model-index:
6
  - name: bart-base-cantonese
7
  results: []
8
- datasets:
9
- - indiejoseph/wikipedia-zh-yue-filtered
10
- - indiejoseph/cc100-yue
11
- - indiejoseph/ted-transcriptions-cantonese
12
- - indiejoseph/c4-cantonese-filtered
13
- - mozilla-foundation/common_voice_13_0
14
- - jed351/rthk_news
15
- - jed351/shikoto_zh_hk
16
- widget:
17
- - text: "今日去咗旺角[MASK]"
18
- example_title: "Mong Kok"
19
- - text: "今時今日香港係一個[MASK]。"
20
- example_title: "Hong Kong"
21
  ---
22
 
23
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -25,9 +11,11 @@ should probably proofread and complete it, then remove this comment. -->
25
 
26
  # bart-base-cantonese
27
 
28
- This model is a continued pre-training of [fnlp/bart-base-chinese](https://huggingface.co/fnlp/bart-base-chinese) on a filtered Cantonese Common Crawl dataset of 950M tokens.
29
 
30
- The tokenizer extends the BERT tokenizer from fnlp/bart-base-chinese with 500 additional Chinese characters commonly found in Cantonese.
31
 
32
  ## Intended uses & limitations
33
 
@@ -57,7 +45,7 @@ The following hyperparameters were used during training:
57
 
58
  ### Framework versions
59
 
60
- - Transformers 4.35.0.dev0
61
  - Pytorch 2.1.1+cu121
62
  - Datasets 2.14.6
63
  - Tokenizers 0.14.1
 
1
  ---
 
2
  tags:
3
  - generated_from_trainer
4
  model-index:
5
  - name: bart-base-cantonese
6
  results: []
7
  ---
8
 
9
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
11
 
12
  # bart-base-cantonese
13
 
14
+ This model was trained from scratch on the None dataset.
15
 
16
+ ## Model description
17
+
18
+ More information needed
19
 
20
  ## Intended uses & limitations
21
 
 
45
 
46
  ### Framework versions
47
 
48
+ - Transformers 4.35.2
49
  - Pytorch 2.1.1+cu121
50
  - Datasets 2.14.6
51
  - Tokenizers 0.14.1
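The removed card text above describes this repo as continued pre-training of fnlp/bart-base-chinese, with a widget that fills a `[MASK]` span. A minimal, hedged sketch of running those widget prompts, assuming the Hub id `indiejoseph/bart-base-cantonese` and that the model is queried the same way as fnlp/bart-base-chinese (text2text generation over an input containing `[MASK]`):

```python
# Hedged usage sketch; the model id and the text2text-generation task are assumptions
# based on the removed card and on how fnlp/bart-base-chinese is normally used.
from transformers import BartForConditionalGeneration, BertTokenizer, Text2TextGenerationPipeline

model_id = "indiejoseph/bart-base-cantonese"                   # assumed repo id
tokenizer = BertTokenizer.from_pretrained(model_id)            # config.json sets tokenizer_class=BertTokenizer
model = BartForConditionalGeneration.from_pretrained(model_id)

pipe = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer)
print(pipe("今日去咗旺角[MASK]", max_length=50, do_sample=False))        # old widget example "Mong Kok"
print(pipe("今時今日香港係一個[MASK]。", max_length=50, do_sample=False))  # old widget example "Hong Kong"
```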
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 3.0,
3
- "eval_accuracy": 0.8416168286663981,
4
- "eval_loss": 0.8037419319152832,
5
- "eval_runtime": 188.7732,
6
- "eval_samples": 6205,
7
- "eval_samples_per_second": 32.87,
8
- "eval_steps_per_second": 2.055,
9
- "perplexity": 2.233884351646835,
10
- "train_loss": 0.9618661650365136,
11
- "train_runtime": 9694.4976,
12
- "train_samples": 63336,
13
- "train_samples_per_second": 19.6,
14
- "train_steps_per_second": 1.225
15
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "eval_accuracy": 0.8386325284846604,
4
+ "eval_loss": 0.8430067300796509,
5
+ "eval_runtime": 339.0879,
6
+ "eval_samples": 11278,
7
+ "eval_samples_per_second": 33.26,
8
+ "eval_steps_per_second": 2.079,
9
+ "perplexity": 2.3233421480194,
10
+ "train_loss": 0.787189019458772,
11
+ "train_runtime": 32945.1529,
12
+ "train_samples": 648171,
13
+ "train_samples_per_second": 19.674,
14
+ "train_steps_per_second": 1.23
15
  }
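The `perplexity` fields in all_results.json appear to be `exp(eval_loss)`; a quick sanity check (treating that relationship as an assumption about the training script, since the commit itself does not state it):

```python
import math

# old run: eval_loss 0.8037... -> reported perplexity 2.2338843...
print(math.exp(0.8037419319152832))  # ≈ 2.2339
# new run: eval_loss 0.8430... -> reported perplexity 2.3233421...
print(math.exp(0.8430067300796509))  # ≈ 2.3233
```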
config.json CHANGED
@@ -69,7 +69,7 @@
69
  },
70
  "tokenizer_class": "BertTokenizer",
71
  "torch_dtype": "float32",
72
- "transformers_version": "4.35.0.dev0",
73
  "use_cache": true,
74
  "vocab_size": 51371
75
  }
 
69
  },
70
  "tokenizer_class": "BertTokenizer",
71
  "torch_dtype": "float32",
72
+ "transformers_version": "4.35.2",
73
  "use_cache": true,
74
  "vocab_size": 51371
75
  }
eval_results.json CHANGED
@@ -1,10 +1,5 @@
1
  {
2
- "epoch": 3.0,
3
- "eval_accuracy": 0.8416168286663981,
4
- "eval_loss": 0.8037419319152832,
5
- "eval_runtime": 188.7732,
6
- "eval_samples": 6205,
7
- "eval_samples_per_second": 32.87,
8
- "eval_steps_per_second": 2.055,
9
- "perplexity": 2.233884351646835
10
  }
 
1
  {
2
+ "eval_accuracy": 0.0,
3
+ "eval_loss": 11.697545369466146,
4
+ "eval_perplexity": 120276.11891132068
5
  }
generation_config.json CHANGED
@@ -8,5 +8,5 @@
8
  "no_repeat_ngram_size": 3,
9
  "num_beams": 4,
10
  "pad_token_id": 0,
11
- "transformers_version": "4.35.0.dev0"
12
  }
 
8
  "no_repeat_ngram_size": 3,
9
  "num_beams": 4,
10
  "pad_token_id": 0,
11
+ "transformers_version": "4.35.2"
12
  }
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cd8dc13b00393f4be5ae61eea43edb23b5f9cc6437f8979391a0a677f2519d1
3
+ size 561314676
runs/Nov28_14-00-45_44d3998eda2b/events.out.tfevents.1701180578.44d3998eda2b.3099222.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1de4ee578d88c3ccacef55afb9a31c2f9e2f78be1ae90047244709a6ff0520f9
3
+ size 5428
runs/Nov28_14-24-09_44d3998eda2b/events.out.tfevents.1701183372.44d3998eda2b.3101657.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:633b472eb60ffec309488358ef2154daac75bf51ebb63e5c5b0c9344d401dcb2
3
+ size 9981
runs/Nov28_18-18-01_44d3998eda2b/events.out.tfevents.1701197416.44d3998eda2b.3136596.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d870af4911b82236fc110d03aded1716e3e54bc74e8d10b3178bce21c0b6cfee
3
+ size 5899
runs/Nov28_19-12-15_44d3998eda2b/events.out.tfevents.1701200666.44d3998eda2b.3141349.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7dc98660544a599d020a0670457cdf57aa621524f6608c822116daeb79e6843f
3
+ size 18811
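The `model.safetensors` and TensorBoard event files above are added as Git LFS pointer files (the three `version` / `oid` / `size` lines), not the binary payloads themselves. A small sketch of parsing such a pointer; the local path is hypothetical and assumes a checkout where LFS has not replaced the pointer with the real file:

```python
def parse_lfs_pointer(path: str) -> dict:
    """Split each 'key value' line of a Git LFS pointer file into a dict."""
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

pointer = parse_lfs_pointer("model.safetensors")   # hypothetical local path
print(pointer["oid"], int(pointer["size"]))        # e.g. sha256:1cd8dc13... and 561314676 bytes
```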
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.0,
3
- "train_loss": 0.9618661650365136,
4
- "train_runtime": 9694.4976,
5
- "train_samples": 63336,
6
- "train_samples_per_second": 19.6,
7
- "train_steps_per_second": 1.225
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.787189019458772,
4
+ "train_runtime": 32945.1529,
5
+ "train_samples": 648171,
6
+ "train_samples_per_second": 19.674,
7
+ "train_steps_per_second": 1.23
8
  }
trainer_state.json CHANGED
@@ -1,166 +1,514 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
  "eval_steps": 2000.0,
6
- "global_step": 11877,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.13,
13
- "learning_rate": 2.5e-05,
14
- "loss": 1.5023,
15
  "step": 500
16
  },
17
  {
18
- "epoch": 0.25,
19
- "learning_rate": 5e-05,
20
- "loss": 1.124,
21
  "step": 1000
22
  },
23
  {
24
- "epoch": 0.38,
25
- "learning_rate": 7.500000000000001e-05,
26
- "loss": 1.0616,
27
  "step": 1500
28
  },
29
  {
30
- "epoch": 0.51,
31
- "learning_rate": 0.0001,
32
- "loss": 1.0334,
33
  "step": 2000
34
  },
35
  {
36
- "epoch": 0.63,
37
- "learning_rate": 9.49377341297965e-05,
38
- "loss": 1.0149,
39
  "step": 2500
40
  },
41
  {
42
- "epoch": 0.76,
43
- "learning_rate": 8.9875468259593e-05,
44
- "loss": 0.9938,
45
  "step": 3000
46
  },
47
  {
48
- "epoch": 0.88,
49
- "learning_rate": 8.481320238938948e-05,
50
- "loss": 0.987,
51
  "step": 3500
52
  },
53
  {
54
- "epoch": 1.01,
55
- "learning_rate": 7.975093651918599e-05,
56
- "loss": 0.97,
57
  "step": 4000
58
  },
59
  {
60
- "epoch": 1.14,
61
- "learning_rate": 7.468867064898249e-05,
62
- "loss": 0.9497,
63
  "step": 4500
64
  },
65
  {
66
- "epoch": 1.26,
67
- "learning_rate": 6.962640477877899e-05,
68
- "loss": 0.9438,
69
  "step": 5000
70
  },
71
  {
72
- "epoch": 1.39,
73
- "learning_rate": 6.456413890857548e-05,
74
- "loss": 0.938,
75
  "step": 5500
76
  },
77
  {
78
- "epoch": 1.52,
79
- "learning_rate": 5.950187303837198e-05,
80
- "loss": 0.9261,
81
  "step": 6000
82
  },
83
  {
84
- "epoch": 1.64,
85
- "learning_rate": 5.443960716816847e-05,
86
- "loss": 0.924,
87
  "step": 6500
88
  },
89
  {
90
- "epoch": 1.77,
91
- "learning_rate": 4.937734129796497e-05,
92
- "loss": 0.9165,
93
  "step": 7000
94
  },
95
  {
96
- "epoch": 1.89,
97
- "learning_rate": 4.431507542776147e-05,
98
- "loss": 0.9034,
99
  "step": 7500
100
  },
101
  {
102
- "epoch": 2.02,
103
- "learning_rate": 3.925280955755796e-05,
104
- "loss": 0.8966,
105
  "step": 8000
106
  },
107
  {
108
- "epoch": 2.15,
109
- "learning_rate": 3.419054368735446e-05,
110
- "loss": 0.8829,
111
  "step": 8500
112
  },
113
  {
114
- "epoch": 2.27,
115
- "learning_rate": 2.9128277817150957e-05,
116
- "loss": 0.8843,
117
  "step": 9000
118
  },
119
  {
120
- "epoch": 2.4,
121
- "learning_rate": 2.4066011946947456e-05,
122
- "loss": 0.8725,
123
  "step": 9500
124
  },
125
  {
126
- "epoch": 2.53,
127
- "learning_rate": 1.900374607674395e-05,
128
- "loss": 0.8712,
129
  "step": 10000
130
  },
131
  {
132
- "epoch": 2.65,
133
- "learning_rate": 1.3941480206540447e-05,
134
- "loss": 0.8673,
135
  "step": 10500
136
  },
137
  {
138
- "epoch": 2.78,
139
- "learning_rate": 8.879214336336945e-06,
140
- "loss": 0.8669,
141
  "step": 11000
142
  },
143
  {
144
- "epoch": 2.9,
145
- "learning_rate": 3.816948466133442e-06,
146
- "loss": 0.8679,
147
  "step": 11500
148
  },
149
  {
150
- "epoch": 3.0,
151
- "step": 11877,
152
- "total_flos": 5.792740247863296e+16,
153
- "train_loss": 0.9618661650365136,
154
- "train_runtime": 9694.4976,
155
- "train_samples_per_second": 19.6,
156
- "train_steps_per_second": 1.225
157
  }
158
  ],
159
  "logging_steps": 500,
160
- "max_steps": 11877,
161
- "num_train_epochs": 3,
162
  "save_steps": 2000,
163
- "total_flos": 5.792740247863296e+16,
164
  "trial_name": null,
165
  "trial_params": null
166
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 2000.0,
6
+ "global_step": 40511,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.01,
13
+ "learning_rate": 6.16979269496545e-06,
14
+ "loss": 1.3917,
15
  "step": 500
16
  },
17
  {
18
+ "epoch": 0.02,
19
+ "learning_rate": 1.23395853899309e-05,
20
+ "loss": 1.0351,
21
  "step": 1000
22
  },
23
  {
24
+ "epoch": 0.04,
25
+ "learning_rate": 1.8509378084896346e-05,
26
+ "loss": 0.9747,
27
  "step": 1500
28
  },
29
  {
30
+ "epoch": 0.05,
31
+ "learning_rate": 2.46791707798618e-05,
32
+ "loss": 0.9373,
33
  "step": 2000
34
  },
35
  {
36
+ "epoch": 0.06,
37
+ "learning_rate": 3.084896347482725e-05,
38
+ "loss": 0.9124,
39
  "step": 2500
40
  },
41
  {
42
+ "epoch": 0.07,
43
+ "learning_rate": 3.701875616979269e-05,
44
+ "loss": 0.8946,
45
  "step": 3000
46
  },
47
  {
48
+ "epoch": 0.09,
49
+ "learning_rate": 4.318854886475814e-05,
50
+ "loss": 0.8819,
51
  "step": 3500
52
  },
53
  {
54
+ "epoch": 0.1,
55
+ "learning_rate": 4.93583415597236e-05,
56
+ "loss": 0.8723,
57
  "step": 4000
58
  },
59
  {
60
+ "epoch": 0.11,
61
+ "learning_rate": 4.9385611234537426e-05,
62
+ "loss": 0.8615,
63
  "step": 4500
64
  },
65
  {
66
+ "epoch": 0.12,
67
+ "learning_rate": 4.869990948736938e-05,
68
+ "loss": 0.8553,
69
  "step": 5000
70
  },
71
  {
72
+ "epoch": 0.14,
73
+ "learning_rate": 4.801420774020132e-05,
74
+ "loss": 0.8466,
75
  "step": 5500
76
  },
77
  {
78
+ "epoch": 0.15,
79
+ "learning_rate": 4.732850599303327e-05,
80
+ "loss": 0.8412,
81
  "step": 6000
82
  },
83
  {
84
+ "epoch": 0.16,
85
+ "learning_rate": 4.6642804245865215e-05,
86
+ "loss": 0.8331,
87
  "step": 6500
88
  },
89
  {
90
+ "epoch": 0.17,
91
+ "learning_rate": 4.5957102498697166e-05,
92
+ "loss": 0.8252,
93
  "step": 7000
94
  },
95
  {
96
+ "epoch": 0.19,
97
+ "learning_rate": 4.527140075152912e-05,
98
+ "loss": 0.8164,
99
  "step": 7500
100
  },
101
  {
102
+ "epoch": 0.2,
103
+ "learning_rate": 4.458569900436107e-05,
104
+ "loss": 0.8139,
105
  "step": 8000
106
  },
107
  {
108
+ "epoch": 0.21,
109
+ "learning_rate": 4.389999725719302e-05,
110
+ "loss": 0.8072,
111
  "step": 8500
112
  },
113
  {
114
+ "epoch": 0.22,
115
+ "learning_rate": 4.321429551002496e-05,
116
+ "loss": 0.8096,
117
  "step": 9000
118
  },
119
  {
120
+ "epoch": 0.23,
121
+ "learning_rate": 4.252859376285691e-05,
122
+ "loss": 0.8026,
123
  "step": 9500
124
  },
125
  {
126
+ "epoch": 0.25,
127
+ "learning_rate": 4.1842892015688856e-05,
128
+ "loss": 0.8027,
129
  "step": 10000
130
  },
131
  {
132
+ "epoch": 0.26,
133
+ "learning_rate": 4.115719026852081e-05,
134
+ "loss": 0.7952,
135
  "step": 10500
136
  },
137
  {
138
+ "epoch": 0.27,
139
+ "learning_rate": 4.047148852135275e-05,
140
+ "loss": 0.7936,
141
  "step": 11000
142
  },
143
  {
144
+ "epoch": 0.28,
145
+ "learning_rate": 3.97857867741847e-05,
146
+ "loss": 0.7913,
147
  "step": 11500
148
  },
149
  {
150
+ "epoch": 0.3,
151
+ "learning_rate": 3.910008502701665e-05,
152
+ "loss": 0.7896,
153
+ "step": 12000
154
+ },
155
+ {
156
+ "epoch": 0.31,
157
+ "learning_rate": 3.8414383279848596e-05,
158
+ "loss": 0.7857,
159
+ "step": 12500
160
+ },
161
+ {
162
+ "epoch": 0.32,
163
+ "learning_rate": 3.7728681532680547e-05,
164
+ "loss": 0.7842,
165
+ "step": 13000
166
+ },
167
+ {
168
+ "epoch": 0.33,
169
+ "learning_rate": 3.70429797855125e-05,
170
+ "loss": 0.7813,
171
+ "step": 13500
172
+ },
173
+ {
174
+ "epoch": 0.35,
175
+ "learning_rate": 3.635727803834445e-05,
176
+ "loss": 0.7812,
177
+ "step": 14000
178
+ },
179
+ {
180
+ "epoch": 0.36,
181
+ "learning_rate": 3.567157629117639e-05,
182
+ "loss": 0.7793,
183
+ "step": 14500
184
+ },
185
+ {
186
+ "epoch": 0.37,
187
+ "learning_rate": 3.498587454400834e-05,
188
+ "loss": 0.7721,
189
+ "step": 15000
190
+ },
191
+ {
192
+ "epoch": 0.38,
193
+ "learning_rate": 3.4300172796840286e-05,
194
+ "loss": 0.7726,
195
+ "step": 15500
196
+ },
197
+ {
198
+ "epoch": 0.39,
199
+ "learning_rate": 3.361447104967224e-05,
200
+ "loss": 0.7714,
201
+ "step": 16000
202
+ },
203
+ {
204
+ "epoch": 0.41,
205
+ "learning_rate": 3.292876930250419e-05,
206
+ "loss": 0.7737,
207
+ "step": 16500
208
+ },
209
+ {
210
+ "epoch": 0.42,
211
+ "learning_rate": 3.224306755533613e-05,
212
+ "loss": 0.7661,
213
+ "step": 17000
214
+ },
215
+ {
216
+ "epoch": 0.43,
217
+ "learning_rate": 3.155736580816808e-05,
218
+ "loss": 0.7675,
219
+ "step": 17500
220
+ },
221
+ {
222
+ "epoch": 0.44,
223
+ "learning_rate": 3.0871664061000026e-05,
224
+ "loss": 0.7688,
225
+ "step": 18000
226
+ },
227
+ {
228
+ "epoch": 0.46,
229
+ "learning_rate": 3.0185962313831976e-05,
230
+ "loss": 0.7637,
231
+ "step": 18500
232
+ },
233
+ {
234
+ "epoch": 0.47,
235
+ "learning_rate": 2.9500260566663924e-05,
236
+ "loss": 0.7644,
237
+ "step": 19000
238
+ },
239
+ {
240
+ "epoch": 0.48,
241
+ "learning_rate": 2.8814558819495874e-05,
242
+ "loss": 0.7632,
243
+ "step": 19500
244
+ },
245
+ {
246
+ "epoch": 0.49,
247
+ "learning_rate": 2.8128857072327825e-05,
248
+ "loss": 0.7607,
249
+ "step": 20000
250
+ },
251
+ {
252
+ "epoch": 0.51,
253
+ "learning_rate": 2.744315532515977e-05,
254
+ "loss": 0.7614,
255
+ "step": 20500
256
+ },
257
+ {
258
+ "epoch": 0.52,
259
+ "learning_rate": 2.675745357799172e-05,
260
+ "loss": 0.7602,
261
+ "step": 21000
262
+ },
263
+ {
264
+ "epoch": 0.53,
265
+ "learning_rate": 2.6071751830823667e-05,
266
+ "loss": 0.7564,
267
+ "step": 21500
268
+ },
269
+ {
270
+ "epoch": 0.54,
271
+ "learning_rate": 2.5386050083655617e-05,
272
+ "loss": 0.7574,
273
+ "step": 22000
274
+ },
275
+ {
276
+ "epoch": 0.56,
277
+ "learning_rate": 2.4700348336487565e-05,
278
+ "loss": 0.7534,
279
+ "step": 22500
280
+ },
281
+ {
282
+ "epoch": 0.57,
283
+ "learning_rate": 2.4014646589319512e-05,
284
+ "loss": 0.7567,
285
+ "step": 23000
286
+ },
287
+ {
288
+ "epoch": 0.58,
289
+ "learning_rate": 2.332894484215146e-05,
290
+ "loss": 0.7534,
291
+ "step": 23500
292
+ },
293
+ {
294
+ "epoch": 0.59,
295
+ "learning_rate": 2.2643243094983406e-05,
296
+ "loss": 0.7555,
297
+ "step": 24000
298
+ },
299
+ {
300
+ "epoch": 0.6,
301
+ "learning_rate": 2.1957541347815357e-05,
302
+ "loss": 0.7534,
303
+ "step": 24500
304
+ },
305
+ {
306
+ "epoch": 0.62,
307
+ "learning_rate": 2.1271839600647304e-05,
308
+ "loss": 0.7475,
309
+ "step": 25000
310
+ },
311
+ {
312
+ "epoch": 0.63,
313
+ "learning_rate": 2.058613785347925e-05,
314
+ "loss": 0.7482,
315
+ "step": 25500
316
+ },
317
+ {
318
+ "epoch": 0.64,
319
+ "learning_rate": 1.99004361063112e-05,
320
+ "loss": 0.7475,
321
+ "step": 26000
322
+ },
323
+ {
324
+ "epoch": 0.65,
325
+ "learning_rate": 1.9214734359143146e-05,
326
+ "loss": 0.7479,
327
+ "step": 26500
328
+ },
329
+ {
330
+ "epoch": 0.67,
331
+ "learning_rate": 1.8529032611975096e-05,
332
+ "loss": 0.7433,
333
+ "step": 27000
334
+ },
335
+ {
336
+ "epoch": 0.68,
337
+ "learning_rate": 1.7843330864807044e-05,
338
+ "loss": 0.7473,
339
+ "step": 27500
340
+ },
341
+ {
342
+ "epoch": 0.69,
343
+ "learning_rate": 1.7157629117638994e-05,
344
+ "loss": 0.7439,
345
+ "step": 28000
346
+ },
347
+ {
348
+ "epoch": 0.7,
349
+ "learning_rate": 1.647192737047094e-05,
350
+ "loss": 0.7423,
351
+ "step": 28500
352
+ },
353
+ {
354
+ "epoch": 0.72,
355
+ "learning_rate": 1.578622562330289e-05,
356
+ "loss": 0.7429,
357
+ "step": 29000
358
+ },
359
+ {
360
+ "epoch": 0.73,
361
+ "learning_rate": 1.5100523876134836e-05,
362
+ "loss": 0.7424,
363
+ "step": 29500
364
+ },
365
+ {
366
+ "epoch": 0.74,
367
+ "learning_rate": 1.4414822128966785e-05,
368
+ "loss": 0.7408,
369
+ "step": 30000
370
+ },
371
+ {
372
+ "epoch": 0.75,
373
+ "learning_rate": 1.3729120381798732e-05,
374
+ "loss": 0.7402,
375
+ "step": 30500
376
+ },
377
+ {
378
+ "epoch": 0.77,
379
+ "learning_rate": 1.3043418634630683e-05,
380
+ "loss": 0.7376,
381
+ "step": 31000
382
+ },
383
+ {
384
+ "epoch": 0.78,
385
+ "learning_rate": 1.235771688746263e-05,
386
+ "loss": 0.7371,
387
+ "step": 31500
388
+ },
389
+ {
390
+ "epoch": 0.79,
391
+ "learning_rate": 1.1672015140294577e-05,
392
+ "loss": 0.741,
393
+ "step": 32000
394
+ },
395
+ {
396
+ "epoch": 0.8,
397
+ "learning_rate": 1.0986313393126526e-05,
398
+ "loss": 0.7354,
399
+ "step": 32500
400
+ },
401
+ {
402
+ "epoch": 0.81,
403
+ "learning_rate": 1.0300611645958475e-05,
404
+ "loss": 0.7343,
405
+ "step": 33000
406
+ },
407
+ {
408
+ "epoch": 0.83,
409
+ "learning_rate": 9.614909898790423e-06,
410
+ "loss": 0.7357,
411
+ "step": 33500
412
+ },
413
+ {
414
+ "epoch": 0.84,
415
+ "learning_rate": 8.92920815162237e-06,
416
+ "loss": 0.7319,
417
+ "step": 34000
418
+ },
419
+ {
420
+ "epoch": 0.85,
421
+ "learning_rate": 8.243506404454319e-06,
422
+ "loss": 0.7339,
423
+ "step": 34500
424
+ },
425
+ {
426
+ "epoch": 0.86,
427
+ "learning_rate": 7.557804657286268e-06,
428
+ "loss": 0.732,
429
+ "step": 35000
430
+ },
431
+ {
432
+ "epoch": 0.88,
433
+ "learning_rate": 6.872102910118215e-06,
434
+ "loss": 0.7352,
435
+ "step": 35500
436
+ },
437
+ {
438
+ "epoch": 0.89,
439
+ "learning_rate": 6.186401162950164e-06,
440
+ "loss": 0.7364,
441
+ "step": 36000
442
+ },
443
+ {
444
+ "epoch": 0.9,
445
+ "learning_rate": 5.500699415782112e-06,
446
+ "loss": 0.733,
447
+ "step": 36500
448
+ },
449
+ {
450
+ "epoch": 0.91,
451
+ "learning_rate": 4.814997668614059e-06,
452
+ "loss": 0.7318,
453
+ "step": 37000
454
+ },
455
+ {
456
+ "epoch": 0.93,
457
+ "learning_rate": 4.129295921446008e-06,
458
+ "loss": 0.7333,
459
+ "step": 37500
460
+ },
461
+ {
462
+ "epoch": 0.94,
463
+ "learning_rate": 3.4435941742779558e-06,
464
+ "loss": 0.7358,
465
+ "step": 38000
466
+ },
467
+ {
468
+ "epoch": 0.95,
469
+ "learning_rate": 2.7578924271099047e-06,
470
+ "loss": 0.7332,
471
+ "step": 38500
472
+ },
473
+ {
474
+ "epoch": 0.96,
475
+ "learning_rate": 2.0721906799418528e-06,
476
+ "loss": 0.7325,
477
+ "step": 39000
478
+ },
479
+ {
480
+ "epoch": 0.98,
481
+ "learning_rate": 1.3864889327738007e-06,
482
+ "loss": 0.7282,
483
+ "step": 39500
484
+ },
485
+ {
486
+ "epoch": 0.99,
487
+ "learning_rate": 7.00787185605749e-07,
488
+ "loss": 0.731,
489
+ "step": 40000
490
+ },
491
+ {
492
+ "epoch": 1.0,
493
+ "learning_rate": 1.508543843769714e-08,
494
+ "loss": 0.7318,
495
+ "step": 40500
496
+ },
497
+ {
498
+ "epoch": 1.0,
499
+ "step": 40511,
500
+ "total_flos": 1.9760674493693952e+17,
501
+ "train_loss": 0.787189019458772,
502
+ "train_runtime": 32945.1529,
503
+ "train_samples_per_second": 19.674,
504
+ "train_steps_per_second": 1.23
505
  }
506
  ],
507
  "logging_steps": 500,
508
+ "max_steps": 40511,
509
+ "num_train_epochs": 1,
510
  "save_steps": 2000,
511
+ "total_flos": 1.9760674493693952e+17,
512
  "trial_name": null,
513
  "trial_params": null
514
  }
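The new `log_history` above traces a learning rate that warms up to a peak near 5e-5 by roughly step 4,050 (about 10% of the 40,511 steps) and then decays approximately linearly to zero. A hedged reconstruction of that curve; the peak LR and warmup fraction are inferred from the logged values, not read from the saved training arguments:

```python
# Same shape as transformers' get_linear_schedule_with_warmup; constants are inferred.
TOTAL_STEPS = 40_511    # max_steps / global_step in the new trainer_state.json
WARMUP_STEPS = 4_051    # assumption: ~10% linear warmup
PEAK_LR = 5e-5          # assumption: peak learning rate implied by the logged values

def linear_warmup_decay(step: int) -> float:
    if step < WARMUP_STEPS:
        return PEAK_LR * step / WARMUP_STEPS
    return PEAK_LR * max(0, TOTAL_STEPS - step) / (TOTAL_STEPS - WARMUP_STEPS)

for step in (500, 4000, 4500, 40500):
    print(step, f"{linear_warmup_decay(step):.3e}")
# 500 -> ~6.17e-06, 4000 -> ~4.94e-05, 4500 -> ~4.94e-05, 40500 -> ~1.5e-08,
# which matches the corresponding learning_rate entries in the log.
```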
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b73eb5f3d3b5c7077b84e7429b72db499f4db630cec7723c33cab47fcfc7056
3
- size 4536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f321738d7dc9125e15c79e15f46839e735df9d313bee21b88fbd1a2628cf64c1
3
+ size 4664
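`training_args.bin` is the Trainer's pickled `TrainingArguments` object, so the hyperparameters behind this run can be inspected directly. A hedged sketch (it assumes a compatible transformers install so the pickle resolves, and loading a pickle should only be done on files you trust):

```python
import torch

# weights_only=False is the default on the PyTorch 2.1.1 listed above; it is spelled out
# because newer PyTorch versions refuse arbitrary pickled objects by default.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)
```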