AlanDlink committed on
Commit
6c2ddf6
1 Parent(s): 8e12c1f

End of training

Browse files
README.md CHANGED
@@ -1,24 +1,27 @@
1
  ---
 
 
2
  license: apache-2.0
3
- base_model: openai/whisper-tiny
4
  tags:
 
5
  - generated_from_trainer
6
- metrics:
7
- - wer
 
8
  model-index:
9
- - name: whisper-tiny-tw
10
  results: []
11
  ---
12
 
13
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
  should probably proofread and complete it, then remove this comment. -->
15
 
16
- # whisper-tiny-tw
17
 
18
- This model is a fine-tuned version of [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.4244
21
- - Wer: 72.7331
22
 
23
  ## Model description
24
 
@@ -51,17 +54,22 @@ The following hyperparameters were used during training:
51
 
52
  ### Training results
53
 
54
- | Training Loss | Epoch | Step | Validation Loss | Wer |
55
- |:-------------:|:-----:|:----:|:---------------:|:-------:|
56
- | 0.3086 | 1.42 | 1000 | 0.4288 | 80.0170 |
57
- | 0.2124 | 2.84 | 2000 | 0.4129 | 80.4205 |
58
- | 0.1149 | 4.26 | 3000 | 0.4217 | 76.7042 |
59
- | 0.0932 | 5.67 | 4000 | 0.4244 | 72.7331 |
 
 
 
 
60
 
61
 
62
  ### Framework versions
63
 
 
64
  - Transformers 4.36.2
65
  - Pytorch 2.1.2+cu121
66
  - Datasets 2.16.0
67
- - Tokenizers 0.15.0
 
1
  ---
2
+ language:
3
+ - zh
4
  license: apache-2.0
5
+ library_name: peft
6
  tags:
7
+ - hf-asr-leaderboard
8
  - generated_from_trainer
9
+ datasets:
10
+ - mozilla-foundation/common_voice_15_0
11
+ base_model: openai/whisper-tiny
12
  model-index:
13
+ - name: Whisper tiny TW - AlanDlink
14
  results: []
15
  ---
16
 
17
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
18
  should probably proofread and complete it, then remove this comment. -->
19
 
20
+ # Whisper tiny TW - AlanDlink
21
 
22
+ This model is a fine-tuned version of [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) on the Common Voice 15.0 dataset.
23
  It achieves the following results on the evaluation set:
24
+ - Loss: 0.6078
 
25
 
26
  ## Model description
27
 
 
54
 
55
  ### Training results
56
 
57
+ | Training Loss | Epoch | Step | Validation Loss |
58
+ |:-------------:|:-----:|:----:|:---------------:|
59
+ | 3.3802 | 0.67 | 500 | 3.3992 |
60
+ | 2.1962 | 1.33 | 1000 | 2.1643 |
61
+ | 1.4348 | 2.0 | 1500 | 1.4068 |
62
+ | 0.7108 | 2.67 | 2000 | 0.6926 |
63
+ | 0.6801 | 3.33 | 2500 | 0.6374 |
64
+ | 0.6273 | 4.0 | 3000 | 0.6195 |
65
+ | 0.6001 | 4.67 | 3500 | 0.6106 |
66
+ | 0.6082 | 5.33 | 4000 | 0.6078 |
67
 
68
 
69
  ### Framework versions
70
 
71
+ - PEFT 0.7.1
72
  - Transformers 4.36.2
73
  - Pytorch 2.1.2+cu121
74
  - Datasets 2.16.0
75
+ - Tokenizers 0.15.0
adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": {
4
+ "base_model_class": "WhisperForConditionalGeneration",
5
+ "parent_library": "transformers.models.whisper.modeling_whisper"
6
+ },
7
+ "base_model_name_or_path": "openai/whisper-tiny",
8
+ "beta1": 0.85,
9
+ "beta2": 0.85,
10
+ "bias": "none",
11
+ "deltaT": 10,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "init_r": 12,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_dropout": 0.1,
21
+ "megatron_config": null,
22
+ "megatron_core": "megatron.core",
23
+ "modules_to_save": null,
24
+ "orth_reg_weight": 0.5,
25
+ "peft_type": "ADALORA",
26
+ "r": 8,
27
+ "rank_pattern": null,
28
+ "revision": null,
29
+ "target_modules": [
30
+ "k_proj",
31
+ "q_proj",
32
+ "v_proj",
33
+ "out_proj",
34
+ "fc1",
35
+ "fc2"
36
+ ],
37
+ "target_r": 4,
38
+ "task_type": null,
39
+ "tfinal": 1000,
40
+ "tinit": 200,
41
+ "total_step": null
42
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5aa2de5188de449d3e07313935fc694b663aaf185889731811c43c1b433203ff
3
+ size 3271824
runs/Jan04_14-43-41_Edge-Ai/events.out.tfevents.1704350621.Edge-Ai.21394.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0da2015bdeeb2d4d27e812cf84519c7672b4dfb3eb4fa13d794d54116fa02d3
3
+ size 5173
runs/Jan04_14-45-40_Edge-Ai/events.out.tfevents.1704350740.Edge-Ai.21458.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24505eeb42cf804f8b3d2f7b6da646581f88a28b929d91ce0c6d0215a5b37b13
3
+ size 10026
runs/Jan04_14-59-59_Edge-Ai/events.out.tfevents.1704351599.Edge-Ai.22055.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ea7bf7147916a07c5553af34ff680e6ccc7d3cdabdf7f3bc065ae4357ad1f02
3
+ size 5494
runs/Jan04_15-02-09_Edge-Ai/events.out.tfevents.1704351729.Edge-Ai.22199.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf96c6335a034925dc81881b966fe311a920a95f501261a93533bfa8d1cc6385
3
+ size 8580
runs/Jan04_15-17-26_Edge-Ai/events.out.tfevents.1704352646.Edge-Ai.22701.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c6c35ebf393d6265cf409a06f628f7377e19b43378daf2a0d8778b3607c9202
3
+ size 8580
runs/Jan04_15-29-18_Edge-Ai/events.out.tfevents.1704353359.Edge-Ai.23773.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6831260c4a014e89620288926d725242f0b19adcceea10d4259801b92b5d845
3
+ size 32811
trainer_state.json ADDED
@@ -0,0 +1,1054 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6078237891197205,
3
+ "best_model_checkpoint": "./whisper-tiny-tw/checkpoint-4000",
4
+ "epoch": 5.333333333333333,
5
+ "eval_steps": 500,
6
+ "global_step": 4000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.03,
13
+ "learning_rate": 4.4e-07,
14
+ "loss": 3.8753,
15
+ "step": 25
16
+ },
17
+ {
18
+ "epoch": 0.07,
19
+ "learning_rate": 9.400000000000001e-07,
20
+ "loss": 3.7563,
21
+ "step": 50
22
+ },
23
+ {
24
+ "epoch": 0.1,
25
+ "learning_rate": 1.44e-06,
26
+ "loss": 3.8738,
27
+ "step": 75
28
+ },
29
+ {
30
+ "epoch": 0.13,
31
+ "learning_rate": 1.94e-06,
32
+ "loss": 3.8201,
33
+ "step": 100
34
+ },
35
+ {
36
+ "epoch": 0.17,
37
+ "learning_rate": 2.4400000000000004e-06,
38
+ "loss": 3.8439,
39
+ "step": 125
40
+ },
41
+ {
42
+ "epoch": 0.2,
43
+ "learning_rate": 2.9400000000000002e-06,
44
+ "loss": 3.817,
45
+ "step": 150
46
+ },
47
+ {
48
+ "epoch": 0.23,
49
+ "learning_rate": 3.44e-06,
50
+ "loss": 3.8385,
51
+ "step": 175
52
+ },
53
+ {
54
+ "epoch": 0.27,
55
+ "learning_rate": 3.94e-06,
56
+ "loss": 3.9375,
57
+ "step": 200
58
+ },
59
+ {
60
+ "epoch": 0.3,
61
+ "learning_rate": 4.440000000000001e-06,
62
+ "loss": 3.8171,
63
+ "step": 225
64
+ },
65
+ {
66
+ "epoch": 0.33,
67
+ "learning_rate": 4.94e-06,
68
+ "loss": 3.8155,
69
+ "step": 250
70
+ },
71
+ {
72
+ "epoch": 0.37,
73
+ "learning_rate": 5.4400000000000004e-06,
74
+ "loss": 3.7763,
75
+ "step": 275
76
+ },
77
+ {
78
+ "epoch": 0.4,
79
+ "learning_rate": 5.94e-06,
80
+ "loss": 3.7395,
81
+ "step": 300
82
+ },
83
+ {
84
+ "epoch": 0.43,
85
+ "learning_rate": 6.440000000000001e-06,
86
+ "loss": 3.6626,
87
+ "step": 325
88
+ },
89
+ {
90
+ "epoch": 0.47,
91
+ "learning_rate": 6.9400000000000005e-06,
92
+ "loss": 3.6135,
93
+ "step": 350
94
+ },
95
+ {
96
+ "epoch": 0.5,
97
+ "learning_rate": 7.440000000000001e-06,
98
+ "loss": 3.6064,
99
+ "step": 375
100
+ },
101
+ {
102
+ "epoch": 0.53,
103
+ "learning_rate": 7.94e-06,
104
+ "loss": 3.6924,
105
+ "step": 400
106
+ },
107
+ {
108
+ "epoch": 0.57,
109
+ "learning_rate": 8.44e-06,
110
+ "loss": 3.5364,
111
+ "step": 425
112
+ },
113
+ {
114
+ "epoch": 0.6,
115
+ "learning_rate": 8.94e-06,
116
+ "loss": 3.621,
117
+ "step": 450
118
+ },
119
+ {
120
+ "epoch": 0.63,
121
+ "learning_rate": 9.42e-06,
122
+ "loss": 3.4745,
123
+ "step": 475
124
+ },
125
+ {
126
+ "epoch": 0.67,
127
+ "learning_rate": 9.920000000000002e-06,
128
+ "loss": 3.3802,
129
+ "step": 500
130
+ },
131
+ {
132
+ "epoch": 0.67,
133
+ "eval_loss": 3.399221658706665,
134
+ "eval_runtime": 217.7852,
135
+ "eval_samples_per_second": 22.729,
136
+ "eval_steps_per_second": 5.684,
137
+ "step": 500
138
+ },
139
+ {
140
+ "epoch": 0.7,
141
+ "learning_rate": 9.940000000000001e-06,
142
+ "loss": 3.3565,
143
+ "step": 525
144
+ },
145
+ {
146
+ "epoch": 0.73,
147
+ "learning_rate": 9.86857142857143e-06,
148
+ "loss": 3.2815,
149
+ "step": 550
150
+ },
151
+ {
152
+ "epoch": 0.77,
153
+ "learning_rate": 9.797142857142858e-06,
154
+ "loss": 3.252,
155
+ "step": 575
156
+ },
157
+ {
158
+ "epoch": 0.8,
159
+ "learning_rate": 9.725714285714287e-06,
160
+ "loss": 3.1434,
161
+ "step": 600
162
+ },
163
+ {
164
+ "epoch": 0.83,
165
+ "learning_rate": 9.654285714285716e-06,
166
+ "loss": 3.1253,
167
+ "step": 625
168
+ },
169
+ {
170
+ "epoch": 0.87,
171
+ "learning_rate": 9.582857142857143e-06,
172
+ "loss": 3.1106,
173
+ "step": 650
174
+ },
175
+ {
176
+ "epoch": 0.9,
177
+ "learning_rate": 9.511428571428572e-06,
178
+ "loss": 3.025,
179
+ "step": 675
180
+ },
181
+ {
182
+ "epoch": 0.93,
183
+ "learning_rate": 9.440000000000001e-06,
184
+ "loss": 2.9601,
185
+ "step": 700
186
+ },
187
+ {
188
+ "epoch": 0.97,
189
+ "learning_rate": 9.368571428571428e-06,
190
+ "loss": 2.8374,
191
+ "step": 725
192
+ },
193
+ {
194
+ "epoch": 1.0,
195
+ "learning_rate": 9.297142857142857e-06,
196
+ "loss": 2.7712,
197
+ "step": 750
198
+ },
199
+ {
200
+ "epoch": 1.03,
201
+ "learning_rate": 9.225714285714286e-06,
202
+ "loss": 2.6499,
203
+ "step": 775
204
+ },
205
+ {
206
+ "epoch": 1.07,
207
+ "learning_rate": 9.154285714285715e-06,
208
+ "loss": 2.6006,
209
+ "step": 800
210
+ },
211
+ {
212
+ "epoch": 1.1,
213
+ "learning_rate": 9.082857142857143e-06,
214
+ "loss": 2.5892,
215
+ "step": 825
216
+ },
217
+ {
218
+ "epoch": 1.13,
219
+ "learning_rate": 9.011428571428572e-06,
220
+ "loss": 2.4973,
221
+ "step": 850
222
+ },
223
+ {
224
+ "epoch": 1.17,
225
+ "learning_rate": 8.94e-06,
226
+ "loss": 2.4601,
227
+ "step": 875
228
+ },
229
+ {
230
+ "epoch": 1.2,
231
+ "learning_rate": 8.86857142857143e-06,
232
+ "loss": 2.3739,
233
+ "step": 900
234
+ },
235
+ {
236
+ "epoch": 1.23,
237
+ "learning_rate": 8.797142857142857e-06,
238
+ "loss": 2.3396,
239
+ "step": 925
240
+ },
241
+ {
242
+ "epoch": 1.27,
243
+ "learning_rate": 8.725714285714286e-06,
244
+ "loss": 2.3263,
245
+ "step": 950
246
+ },
247
+ {
248
+ "epoch": 1.3,
249
+ "learning_rate": 8.654285714285715e-06,
250
+ "loss": 2.2193,
251
+ "step": 975
252
+ },
253
+ {
254
+ "epoch": 1.33,
255
+ "learning_rate": 8.582857142857144e-06,
256
+ "loss": 2.1962,
257
+ "step": 1000
258
+ },
259
+ {
260
+ "epoch": 1.33,
261
+ "eval_loss": 2.1643242835998535,
262
+ "eval_runtime": 223.3057,
263
+ "eval_samples_per_second": 22.167,
264
+ "eval_steps_per_second": 5.544,
265
+ "step": 1000
266
+ },
267
+ {
268
+ "epoch": 1.37,
269
+ "learning_rate": 8.511428571428571e-06,
270
+ "loss": 2.1077,
271
+ "step": 1025
272
+ },
273
+ {
274
+ "epoch": 1.4,
275
+ "learning_rate": 8.44e-06,
276
+ "loss": 2.1055,
277
+ "step": 1050
278
+ },
279
+ {
280
+ "epoch": 1.43,
281
+ "learning_rate": 8.36857142857143e-06,
282
+ "loss": 2.0465,
283
+ "step": 1075
284
+ },
285
+ {
286
+ "epoch": 1.47,
287
+ "learning_rate": 8.297142857142859e-06,
288
+ "loss": 1.9805,
289
+ "step": 1100
290
+ },
291
+ {
292
+ "epoch": 1.5,
293
+ "learning_rate": 8.225714285714288e-06,
294
+ "loss": 2.0129,
295
+ "step": 1125
296
+ },
297
+ {
298
+ "epoch": 1.53,
299
+ "learning_rate": 8.154285714285715e-06,
300
+ "loss": 1.9447,
301
+ "step": 1150
302
+ },
303
+ {
304
+ "epoch": 1.57,
305
+ "learning_rate": 8.082857142857144e-06,
306
+ "loss": 1.9103,
307
+ "step": 1175
308
+ },
309
+ {
310
+ "epoch": 1.6,
311
+ "learning_rate": 8.011428571428573e-06,
312
+ "loss": 1.8697,
313
+ "step": 1200
314
+ },
315
+ {
316
+ "epoch": 1.63,
317
+ "learning_rate": 7.94e-06,
318
+ "loss": 1.8626,
319
+ "step": 1225
320
+ },
321
+ {
322
+ "epoch": 1.67,
323
+ "learning_rate": 7.86857142857143e-06,
324
+ "loss": 1.8127,
325
+ "step": 1250
326
+ },
327
+ {
328
+ "epoch": 1.7,
329
+ "learning_rate": 7.797142857142858e-06,
330
+ "loss": 1.8026,
331
+ "step": 1275
332
+ },
333
+ {
334
+ "epoch": 1.73,
335
+ "learning_rate": 7.725714285714286e-06,
336
+ "loss": 1.7526,
337
+ "step": 1300
338
+ },
339
+ {
340
+ "epoch": 1.77,
341
+ "learning_rate": 7.654285714285715e-06,
342
+ "loss": 1.6291,
343
+ "step": 1325
344
+ },
345
+ {
346
+ "epoch": 1.8,
347
+ "learning_rate": 7.5828571428571444e-06,
348
+ "loss": 1.6233,
349
+ "step": 1350
350
+ },
351
+ {
352
+ "epoch": 1.83,
353
+ "learning_rate": 7.511428571428572e-06,
354
+ "loss": 1.6377,
355
+ "step": 1375
356
+ },
357
+ {
358
+ "epoch": 1.87,
359
+ "learning_rate": 7.440000000000001e-06,
360
+ "loss": 1.5758,
361
+ "step": 1400
362
+ },
363
+ {
364
+ "epoch": 1.9,
365
+ "learning_rate": 7.36857142857143e-06,
366
+ "loss": 1.5483,
367
+ "step": 1425
368
+ },
369
+ {
370
+ "epoch": 1.93,
371
+ "learning_rate": 7.297142857142858e-06,
372
+ "loss": 1.5166,
373
+ "step": 1450
374
+ },
375
+ {
376
+ "epoch": 1.97,
377
+ "learning_rate": 7.225714285714286e-06,
378
+ "loss": 1.4319,
379
+ "step": 1475
380
+ },
381
+ {
382
+ "epoch": 2.0,
383
+ "learning_rate": 7.154285714285715e-06,
384
+ "loss": 1.4348,
385
+ "step": 1500
386
+ },
387
+ {
388
+ "epoch": 2.0,
389
+ "eval_loss": 1.4068281650543213,
390
+ "eval_runtime": 219.1085,
391
+ "eval_samples_per_second": 22.592,
392
+ "eval_steps_per_second": 5.65,
393
+ "step": 1500
394
+ },
395
+ {
396
+ "epoch": 2.03,
397
+ "learning_rate": 7.082857142857143e-06,
398
+ "loss": 1.3035,
399
+ "step": 1525
400
+ },
401
+ {
402
+ "epoch": 2.07,
403
+ "learning_rate": 7.011428571428572e-06,
404
+ "loss": 1.3554,
405
+ "step": 1550
406
+ },
407
+ {
408
+ "epoch": 2.1,
409
+ "learning_rate": 6.9400000000000005e-06,
410
+ "loss": 1.3143,
411
+ "step": 1575
412
+ },
413
+ {
414
+ "epoch": 2.13,
415
+ "learning_rate": 6.868571428571429e-06,
416
+ "loss": 1.2939,
417
+ "step": 1600
418
+ },
419
+ {
420
+ "epoch": 2.17,
421
+ "learning_rate": 6.797142857142858e-06,
422
+ "loss": 1.2099,
423
+ "step": 1625
424
+ },
425
+ {
426
+ "epoch": 2.2,
427
+ "learning_rate": 6.725714285714287e-06,
428
+ "loss": 1.2248,
429
+ "step": 1650
430
+ },
431
+ {
432
+ "epoch": 2.23,
433
+ "learning_rate": 6.654285714285716e-06,
434
+ "loss": 1.1217,
435
+ "step": 1675
436
+ },
437
+ {
438
+ "epoch": 2.27,
439
+ "learning_rate": 6.582857142857143e-06,
440
+ "loss": 1.1274,
441
+ "step": 1700
442
+ },
443
+ {
444
+ "epoch": 2.3,
445
+ "learning_rate": 6.511428571428572e-06,
446
+ "loss": 1.0399,
447
+ "step": 1725
448
+ },
449
+ {
450
+ "epoch": 2.33,
451
+ "learning_rate": 6.440000000000001e-06,
452
+ "loss": 1.046,
453
+ "step": 1750
454
+ },
455
+ {
456
+ "epoch": 2.37,
457
+ "learning_rate": 6.371428571428572e-06,
458
+ "loss": 0.9206,
459
+ "step": 1775
460
+ },
461
+ {
462
+ "epoch": 2.4,
463
+ "learning_rate": 6.300000000000001e-06,
464
+ "loss": 0.9125,
465
+ "step": 1800
466
+ },
467
+ {
468
+ "epoch": 2.43,
469
+ "learning_rate": 6.22857142857143e-06,
470
+ "loss": 0.9636,
471
+ "step": 1825
472
+ },
473
+ {
474
+ "epoch": 2.47,
475
+ "learning_rate": 6.157142857142858e-06,
476
+ "loss": 0.8434,
477
+ "step": 1850
478
+ },
479
+ {
480
+ "epoch": 2.5,
481
+ "learning_rate": 6.085714285714286e-06,
482
+ "loss": 0.7642,
483
+ "step": 1875
484
+ },
485
+ {
486
+ "epoch": 2.53,
487
+ "learning_rate": 6.014285714285715e-06,
488
+ "loss": 0.7749,
489
+ "step": 1900
490
+ },
491
+ {
492
+ "epoch": 2.57,
493
+ "learning_rate": 5.942857142857143e-06,
494
+ "loss": 0.7122,
495
+ "step": 1925
496
+ },
497
+ {
498
+ "epoch": 2.6,
499
+ "learning_rate": 5.871428571428572e-06,
500
+ "loss": 0.7065,
501
+ "step": 1950
502
+ },
503
+ {
504
+ "epoch": 2.63,
505
+ "learning_rate": 5.8e-06,
506
+ "loss": 0.7062,
507
+ "step": 1975
508
+ },
509
+ {
510
+ "epoch": 2.67,
511
+ "learning_rate": 5.7285714285714285e-06,
512
+ "loss": 0.7108,
513
+ "step": 2000
514
+ },
515
+ {
516
+ "epoch": 2.67,
517
+ "eval_loss": 0.6925591230392456,
518
+ "eval_runtime": 219.8232,
519
+ "eval_samples_per_second": 22.518,
520
+ "eval_steps_per_second": 5.632,
521
+ "step": 2000
522
+ },
523
+ {
524
+ "epoch": 2.7,
525
+ "learning_rate": 5.6571428571428576e-06,
526
+ "loss": 0.7476,
527
+ "step": 2025
528
+ },
529
+ {
530
+ "epoch": 2.73,
531
+ "learning_rate": 5.5857142857142866e-06,
532
+ "loss": 0.6809,
533
+ "step": 2050
534
+ },
535
+ {
536
+ "epoch": 2.77,
537
+ "learning_rate": 5.514285714285714e-06,
538
+ "loss": 0.625,
539
+ "step": 2075
540
+ },
541
+ {
542
+ "epoch": 2.8,
543
+ "learning_rate": 5.442857142857143e-06,
544
+ "loss": 0.6623,
545
+ "step": 2100
546
+ },
547
+ {
548
+ "epoch": 2.83,
549
+ "learning_rate": 5.371428571428572e-06,
550
+ "loss": 0.7041,
551
+ "step": 2125
552
+ },
553
+ {
554
+ "epoch": 2.87,
555
+ "learning_rate": 5.300000000000001e-06,
556
+ "loss": 0.6609,
557
+ "step": 2150
558
+ },
559
+ {
560
+ "epoch": 2.9,
561
+ "learning_rate": 5.22857142857143e-06,
562
+ "loss": 0.7054,
563
+ "step": 2175
564
+ },
565
+ {
566
+ "epoch": 2.93,
567
+ "learning_rate": 5.157142857142857e-06,
568
+ "loss": 0.694,
569
+ "step": 2200
570
+ },
571
+ {
572
+ "epoch": 2.97,
573
+ "learning_rate": 5.085714285714286e-06,
574
+ "loss": 0.702,
575
+ "step": 2225
576
+ },
577
+ {
578
+ "epoch": 3.0,
579
+ "learning_rate": 5.014285714285715e-06,
580
+ "loss": 0.6537,
581
+ "step": 2250
582
+ },
583
+ {
584
+ "epoch": 3.03,
585
+ "learning_rate": 4.9428571428571435e-06,
586
+ "loss": 0.6367,
587
+ "step": 2275
588
+ },
589
+ {
590
+ "epoch": 3.07,
591
+ "learning_rate": 4.871428571428572e-06,
592
+ "loss": 0.6481,
593
+ "step": 2300
594
+ },
595
+ {
596
+ "epoch": 3.1,
597
+ "learning_rate": 4.800000000000001e-06,
598
+ "loss": 0.6755,
599
+ "step": 2325
600
+ },
601
+ {
602
+ "epoch": 3.13,
603
+ "learning_rate": 4.728571428571429e-06,
604
+ "loss": 0.6257,
605
+ "step": 2350
606
+ },
607
+ {
608
+ "epoch": 3.17,
609
+ "learning_rate": 4.657142857142857e-06,
610
+ "loss": 0.6438,
611
+ "step": 2375
612
+ },
613
+ {
614
+ "epoch": 3.2,
615
+ "learning_rate": 4.585714285714286e-06,
616
+ "loss": 0.7029,
617
+ "step": 2400
618
+ },
619
+ {
620
+ "epoch": 3.23,
621
+ "learning_rate": 4.514285714285714e-06,
622
+ "loss": 0.6453,
623
+ "step": 2425
624
+ },
625
+ {
626
+ "epoch": 3.27,
627
+ "learning_rate": 4.442857142857143e-06,
628
+ "loss": 0.6728,
629
+ "step": 2450
630
+ },
631
+ {
632
+ "epoch": 3.3,
633
+ "learning_rate": 4.371428571428572e-06,
634
+ "loss": 0.6667,
635
+ "step": 2475
636
+ },
637
+ {
638
+ "epoch": 3.33,
639
+ "learning_rate": 4.3e-06,
640
+ "loss": 0.6801,
641
+ "step": 2500
642
+ },
643
+ {
644
+ "epoch": 3.33,
645
+ "eval_loss": 0.6374496221542358,
646
+ "eval_runtime": 222.1867,
647
+ "eval_samples_per_second": 22.279,
648
+ "eval_steps_per_second": 5.572,
649
+ "step": 2500
650
+ },
651
+ {
652
+ "epoch": 3.37,
653
+ "learning_rate": 4.228571428571429e-06,
654
+ "loss": 0.6021,
655
+ "step": 2525
656
+ },
657
+ {
658
+ "epoch": 3.4,
659
+ "learning_rate": 4.1571428571428575e-06,
660
+ "loss": 0.6657,
661
+ "step": 2550
662
+ },
663
+ {
664
+ "epoch": 3.43,
665
+ "learning_rate": 4.0857142857142865e-06,
666
+ "loss": 0.6338,
667
+ "step": 2575
668
+ },
669
+ {
670
+ "epoch": 3.47,
671
+ "learning_rate": 4.014285714285715e-06,
672
+ "loss": 0.6559,
673
+ "step": 2600
674
+ },
675
+ {
676
+ "epoch": 3.5,
677
+ "learning_rate": 3.942857142857143e-06,
678
+ "loss": 0.6559,
679
+ "step": 2625
680
+ },
681
+ {
682
+ "epoch": 3.53,
683
+ "learning_rate": 3.871428571428572e-06,
684
+ "loss": 0.6894,
685
+ "step": 2650
686
+ },
687
+ {
688
+ "epoch": 3.57,
689
+ "learning_rate": 3.8000000000000005e-06,
690
+ "loss": 0.6422,
691
+ "step": 2675
692
+ },
693
+ {
694
+ "epoch": 3.6,
695
+ "learning_rate": 3.7285714285714286e-06,
696
+ "loss": 0.6211,
697
+ "step": 2700
698
+ },
699
+ {
700
+ "epoch": 3.63,
701
+ "learning_rate": 3.6571428571428576e-06,
702
+ "loss": 0.6228,
703
+ "step": 2725
704
+ },
705
+ {
706
+ "epoch": 3.67,
707
+ "learning_rate": 3.5857142857142862e-06,
708
+ "loss": 0.5956,
709
+ "step": 2750
710
+ },
711
+ {
712
+ "epoch": 3.7,
713
+ "learning_rate": 3.5142857142857144e-06,
714
+ "loss": 0.6002,
715
+ "step": 2775
716
+ },
717
+ {
718
+ "epoch": 3.73,
719
+ "learning_rate": 3.4428571428571434e-06,
720
+ "loss": 0.6384,
721
+ "step": 2800
722
+ },
723
+ {
724
+ "epoch": 3.77,
725
+ "learning_rate": 3.3714285714285716e-06,
726
+ "loss": 0.6283,
727
+ "step": 2825
728
+ },
729
+ {
730
+ "epoch": 3.8,
731
+ "learning_rate": 3.3000000000000006e-06,
732
+ "loss": 0.6691,
733
+ "step": 2850
734
+ },
735
+ {
736
+ "epoch": 3.83,
737
+ "learning_rate": 3.2285714285714288e-06,
738
+ "loss": 0.5512,
739
+ "step": 2875
740
+ },
741
+ {
742
+ "epoch": 3.87,
743
+ "learning_rate": 3.1571428571428573e-06,
744
+ "loss": 0.5573,
745
+ "step": 2900
746
+ },
747
+ {
748
+ "epoch": 3.9,
749
+ "learning_rate": 3.085714285714286e-06,
750
+ "loss": 0.5772,
751
+ "step": 2925
752
+ },
753
+ {
754
+ "epoch": 3.93,
755
+ "learning_rate": 3.0142857142857145e-06,
756
+ "loss": 0.6307,
757
+ "step": 2950
758
+ },
759
+ {
760
+ "epoch": 3.97,
761
+ "learning_rate": 2.9428571428571427e-06,
762
+ "loss": 0.5967,
763
+ "step": 2975
764
+ },
765
+ {
766
+ "epoch": 4.0,
767
+ "learning_rate": 2.8714285714285717e-06,
768
+ "loss": 0.6273,
769
+ "step": 3000
770
+ },
771
+ {
772
+ "epoch": 4.0,
773
+ "eval_loss": 0.6194672584533691,
774
+ "eval_runtime": 222.9187,
775
+ "eval_samples_per_second": 22.205,
776
+ "eval_steps_per_second": 5.554,
777
+ "step": 3000
778
+ },
779
+ {
780
+ "epoch": 4.03,
781
+ "learning_rate": 2.8000000000000003e-06,
782
+ "loss": 0.5915,
783
+ "step": 3025
784
+ },
785
+ {
786
+ "epoch": 4.07,
787
+ "learning_rate": 2.728571428571429e-06,
788
+ "loss": 0.6411,
789
+ "step": 3050
790
+ },
791
+ {
792
+ "epoch": 4.1,
793
+ "learning_rate": 2.6571428571428575e-06,
794
+ "loss": 0.6522,
795
+ "step": 3075
796
+ },
797
+ {
798
+ "epoch": 4.13,
799
+ "learning_rate": 2.5857142857142856e-06,
800
+ "loss": 0.5862,
801
+ "step": 3100
802
+ },
803
+ {
804
+ "epoch": 4.17,
805
+ "learning_rate": 2.5142857142857147e-06,
806
+ "loss": 0.5907,
807
+ "step": 3125
808
+ },
809
+ {
810
+ "epoch": 4.2,
811
+ "learning_rate": 2.442857142857143e-06,
812
+ "loss": 0.6228,
813
+ "step": 3150
814
+ },
815
+ {
816
+ "epoch": 4.23,
817
+ "learning_rate": 2.371428571428572e-06,
818
+ "loss": 0.639,
819
+ "step": 3175
820
+ },
821
+ {
822
+ "epoch": 4.27,
823
+ "learning_rate": 2.3000000000000004e-06,
824
+ "loss": 0.6105,
825
+ "step": 3200
826
+ },
827
+ {
828
+ "epoch": 4.3,
829
+ "learning_rate": 2.228571428571429e-06,
830
+ "loss": 0.6154,
831
+ "step": 3225
832
+ },
833
+ {
834
+ "epoch": 4.33,
835
+ "learning_rate": 2.157142857142857e-06,
836
+ "loss": 0.6179,
837
+ "step": 3250
838
+ },
839
+ {
840
+ "epoch": 4.37,
841
+ "learning_rate": 2.0857142857142858e-06,
842
+ "loss": 0.5826,
843
+ "step": 3275
844
+ },
845
+ {
846
+ "epoch": 4.4,
847
+ "learning_rate": 2.0142857142857144e-06,
848
+ "loss": 0.6063,
849
+ "step": 3300
850
+ },
851
+ {
852
+ "epoch": 4.43,
853
+ "learning_rate": 1.942857142857143e-06,
854
+ "loss": 0.6412,
855
+ "step": 3325
856
+ },
857
+ {
858
+ "epoch": 4.47,
859
+ "learning_rate": 1.8714285714285715e-06,
860
+ "loss": 0.5746,
861
+ "step": 3350
862
+ },
863
+ {
864
+ "epoch": 4.5,
865
+ "learning_rate": 1.8000000000000001e-06,
866
+ "loss": 0.6644,
867
+ "step": 3375
868
+ },
869
+ {
870
+ "epoch": 4.53,
871
+ "learning_rate": 1.7285714285714287e-06,
872
+ "loss": 0.5814,
873
+ "step": 3400
874
+ },
875
+ {
876
+ "epoch": 4.57,
877
+ "learning_rate": 1.657142857142857e-06,
878
+ "loss": 0.6841,
879
+ "step": 3425
880
+ },
881
+ {
882
+ "epoch": 4.6,
883
+ "learning_rate": 1.5857142857142857e-06,
884
+ "loss": 0.6066,
885
+ "step": 3450
886
+ },
887
+ {
888
+ "epoch": 4.63,
889
+ "learning_rate": 1.5142857142857145e-06,
890
+ "loss": 0.5992,
891
+ "step": 3475
892
+ },
893
+ {
894
+ "epoch": 4.67,
895
+ "learning_rate": 1.442857142857143e-06,
896
+ "loss": 0.6001,
897
+ "step": 3500
898
+ },
899
+ {
900
+ "epoch": 4.67,
901
+ "eval_loss": 0.6105530858039856,
902
+ "eval_runtime": 216.5509,
903
+ "eval_samples_per_second": 22.858,
904
+ "eval_steps_per_second": 5.717,
905
+ "step": 3500
906
+ },
907
+ {
908
+ "epoch": 4.7,
909
+ "learning_rate": 1.3714285714285717e-06,
910
+ "loss": 0.6524,
911
+ "step": 3525
912
+ },
913
+ {
914
+ "epoch": 4.73,
915
+ "learning_rate": 1.3e-06,
916
+ "loss": 0.5826,
917
+ "step": 3550
918
+ },
919
+ {
920
+ "epoch": 4.77,
921
+ "learning_rate": 1.2285714285714286e-06,
922
+ "loss": 0.5674,
923
+ "step": 3575
924
+ },
925
+ {
926
+ "epoch": 4.8,
927
+ "learning_rate": 1.1571428571428572e-06,
928
+ "loss": 0.5817,
929
+ "step": 3600
930
+ },
931
+ {
932
+ "epoch": 4.83,
933
+ "learning_rate": 1.0857142857142858e-06,
934
+ "loss": 0.6095,
935
+ "step": 3625
936
+ },
937
+ {
938
+ "epoch": 4.87,
939
+ "learning_rate": 1.0142857142857144e-06,
940
+ "loss": 0.6374,
941
+ "step": 3650
942
+ },
943
+ {
944
+ "epoch": 4.9,
945
+ "learning_rate": 9.42857142857143e-07,
946
+ "loss": 0.6038,
947
+ "step": 3675
948
+ },
949
+ {
950
+ "epoch": 4.93,
951
+ "learning_rate": 8.714285714285716e-07,
952
+ "loss": 0.6349,
953
+ "step": 3700
954
+ },
955
+ {
956
+ "epoch": 4.97,
957
+ "learning_rate": 8.000000000000001e-07,
958
+ "loss": 0.6017,
959
+ "step": 3725
960
+ },
961
+ {
962
+ "epoch": 5.0,
963
+ "learning_rate": 7.285714285714287e-07,
964
+ "loss": 0.6295,
965
+ "step": 3750
966
+ },
967
+ {
968
+ "epoch": 5.03,
969
+ "learning_rate": 6.571428571428571e-07,
970
+ "loss": 0.631,
971
+ "step": 3775
972
+ },
973
+ {
974
+ "epoch": 5.07,
975
+ "learning_rate": 5.857142857142857e-07,
976
+ "loss": 0.5753,
977
+ "step": 3800
978
+ },
979
+ {
980
+ "epoch": 5.1,
981
+ "learning_rate": 5.142857142857143e-07,
982
+ "loss": 0.5888,
983
+ "step": 3825
984
+ },
985
+ {
986
+ "epoch": 5.13,
987
+ "learning_rate": 4.4285714285714286e-07,
988
+ "loss": 0.6269,
989
+ "step": 3850
990
+ },
991
+ {
992
+ "epoch": 5.17,
993
+ "learning_rate": 3.7142857142857145e-07,
994
+ "loss": 0.5584,
995
+ "step": 3875
996
+ },
997
+ {
998
+ "epoch": 5.2,
999
+ "learning_rate": 3.0000000000000004e-07,
1000
+ "loss": 0.642,
1001
+ "step": 3900
1002
+ },
1003
+ {
1004
+ "epoch": 5.23,
1005
+ "learning_rate": 2.285714285714286e-07,
1006
+ "loss": 0.561,
1007
+ "step": 3925
1008
+ },
1009
+ {
1010
+ "epoch": 5.27,
1011
+ "learning_rate": 1.5714285714285717e-07,
1012
+ "loss": 0.6469,
1013
+ "step": 3950
1014
+ },
1015
+ {
1016
+ "epoch": 5.3,
1017
+ "learning_rate": 8.571428571428573e-08,
1018
+ "loss": 0.6164,
1019
+ "step": 3975
1020
+ },
1021
+ {
1022
+ "epoch": 5.33,
1023
+ "learning_rate": 1.4285714285714288e-08,
1024
+ "loss": 0.6082,
1025
+ "step": 4000
1026
+ },
1027
+ {
1028
+ "epoch": 5.33,
1029
+ "eval_loss": 0.6078237891197205,
1030
+ "eval_runtime": 220.2328,
1031
+ "eval_samples_per_second": 22.476,
1032
+ "eval_steps_per_second": 5.621,
1033
+ "step": 4000
1034
+ },
1035
+ {
1036
+ "epoch": 5.33,
1037
+ "step": 4000,
1038
+ "total_flos": 1.6501699427328e+18,
1039
+ "train_loss": 1.4726774854660034,
1040
+ "train_runtime": 5301.1107,
1041
+ "train_samples_per_second": 12.073,
1042
+ "train_steps_per_second": 0.755
1043
+ }
1044
+ ],
1045
+ "logging_steps": 25,
1046
+ "max_steps": 4000,
1047
+ "num_input_tokens_seen": 0,
1048
+ "num_train_epochs": 6,
1049
+ "save_steps": 500,
1050
+ "total_flos": 1.6501699427328e+18,
1051
+ "train_batch_size": 4,
1052
+ "trial_name": null,
1053
+ "trial_params": null
1054
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68200f0eedc80778d9d3d214941bd827c34b44df3ff9f9306ed17f6784a87550
3
  size 4856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:775d3f66b05e1f0b0a241c7939bfe039c39404da0b0ccec1f12b1d2afec24a2c
3
  size 4856