Gizachew committed
Commit 691b163
Parent: 30cad70

End of training
README.md ADDED
---
license: apache-2.0
base_model: ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition
tags:
- generated_from_trainer
metrics:
- accuracy
model-index:
- name: xls-r-amharic
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/hallo23/huggingface/runs/5pgjd6az)

# xls-r-amharic

This model is a fine-tuned version of [ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition](https://huggingface.co/ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition) on an unknown dataset.
It achieves the following results on the evaluation set:
- Loss: 0.1237
- Accuracy: 0.9778

## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

More information needed

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training (a `TrainingArguments` sketch follows the list):
- learning_rate: 1e-05
- train_batch_size: 4
- eval_batch_size: 4
- seed: 42
- gradient_accumulation_steps: 2
- total_train_batch_size: 8
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- num_epochs: 15
- mixed_precision_training: Native AMP

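For reproducibility, a minimal sketch of the equivalent `TrainingArguments`, assuming the standard `transformers.Trainer` API. The output directory is an assumption; the 500-step eval/save cadence and 100-step logging come from trainer_state.json; the Adam betas and epsilon above are the library defaults, so they need no explicit arguments:

```python
from transformers import TrainingArguments

# Sketch reconstructed from the hyperparameter list above; output_dir is assumed,
# and the eval/save/logging cadence is taken from trainer_state.json.
training_args = TrainingArguments(
    output_dir="xls-r-amharic",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,   # effective train batch size: 4 * 2 = 8
    num_train_epochs=15,
    lr_scheduler_type="linear",
    seed=42,
    fp16=True,                       # "Native AMP" mixed-precision training
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    load_best_model_at_end=True,     # required for early stopping (see trainer_state.json)
    metric_for_best_model="eval_loss",
)
```
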
### Training results

| Training Loss | Epoch   | Step | Validation Loss | Accuracy |
|:-------------:|:-------:|:----:|:---------------:|:--------:|
| 0.2847        | 2.0202  | 500  | 0.2479          | 0.9212   |
| 0.1138        | 4.0404  | 1000 | 0.2063          | 0.9434   |
| 0.0614        | 6.0606  | 1500 | 0.1415          | 0.9657   |
| 0.0349        | 8.0808  | 2000 | 0.1383          | 0.9737   |
| 0.0143        | 10.1010 | 2500 | 0.0901          | 0.9818   |
| 0.0178        | 12.1212 | 3000 | 0.1188          | 0.9778   |
| 0.0222        | 14.1414 | 3500 | 0.1237          | 0.9778   |

### Framework versions

- Transformers 4.41.0.dev0
- PyTorch 2.1.2
- Datasets 2.19.1.dev0
- Tokenizers 0.19.1
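
A hedged inference sketch. The repo id is assumed from the commit author and model name, and config.json declares the custom `Wav2Vec2ForSpeechClassification` architecture from the base model's training code, so loading through transformers' stock `AutoModelForAudioClassification` is an approximation and may warn about unmatched classifier weights:

```python
import torch
import librosa
from transformers import AutoConfig, AutoFeatureExtractor, AutoModelForAudioClassification

model_id = "Gizachew/xls-r-amharic"  # assumed repo id

config = AutoConfig.from_pretrained(model_id)
extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = AutoModelForAudioClassification.from_pretrained(model_id).eval()

# The feature extractor expects 16 kHz mono audio (see preprocessor_config.json).
speech, _ = librosa.load("sample.wav", sr=extractor.sampling_rate, mono=True)
inputs = extractor(speech, sampling_rate=extractor.sampling_rate, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits
print(config.id2label[logits.argmax(dim=-1).item()])
# one of: 01Neutral, 02Fearful, 03Happy, 04Sad, 05Angry
```
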
all_results.json ADDED
{
    "epoch": 14.969696969696969,
    "total_flos": 3.163398064220592e+18,
    "train_loss": 0.13106194541521884,
    "train_runtime": 5199.6798,
    "train_samples": 1979,
    "train_samples_per_second": 5.709,
    "train_steps_per_second": 0.713
}
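
As a quick consistency check, the throughput figures follow from the sample count, epoch count, step count, and runtime reported here and in trainer_state.json:

```python
train_samples, num_epochs, runtime_s = 1979, 15, 5199.6798
global_step = 3705  # from trainer_state.json

print(round(train_samples * num_epochs / runtime_s, 3))  # 5.709 samples/s
print(round(global_step / runtime_s, 3))                 # 0.713 steps/s
```
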
config.json ADDED
{
  "_name_or_path": "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
  "activation_dropout": 0.05,
  "adapter_attn_dim": null,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForSpeechClassification"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [512, 512, 512, 512, 512, 512, 512],
  "conv_kernel": [10, 3, 3, 3, 3, 2, 2],
  "conv_stride": [5, 2, 2, 2, 2, 2, 2],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.05,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.0,
  "finetuning_task": "wav2vec2_clf",
  "hidden_act": "gelu",
  "hidden_dropout": 0.05,
  "hidden_size": 1024,
  "id2label": {
    "0": "01Neutral",
    "1": "02Fearful",
    "2": "03Happy",
    "3": "04Sad",
    "4": "05Angry"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "01Neutral": 0,
    "02Fearful": 1,
    "03Happy": 2,
    "04Sad": 3,
    "05Angry": 4
  },
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.05,
  "mask_channel_length": 10,
  "mask_channel_min_space": 1,
  "mask_channel_other": 0.0,
  "mask_channel_prob": 0.0,
  "mask_channel_selection": "static",
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_min_space": 1,
  "mask_time_other": 0.0,
  "mask_time_prob": 0.05,
  "mask_time_selection": "static",
  "model_type": "wav2vec2",
  "num_adapter_layers": 3,
  "num_attention_heads": 16,
  "num_codevector_groups": 2,
  "num_codevectors_per_group": 320,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 24,
  "num_negatives": 100,
  "output_hidden_size": 1024,
  "pad_token_id": 0,
  "pooling_mode": "mean",
  "problem_type": "single_label_classification",
  "proj_codevector_dim": 256,
  "tdnn_dilation": [1, 2, 3, 1, 1],
  "tdnn_dim": [512, 512, 512, 512, 1500],
  "tdnn_kernel": [5, 3, 3, 1, 1],
  "torch_dtype": "float32",
  "transformers_version": "4.41.0.dev0",
  "use_weighted_layer_sum": false,
  "vocab_size": 33,
  "xvector_output_dim": 512
}
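
A hedged reading of the classification-relevant fields: `"pooling_mode": "mean"` with `hidden_size` 1024, `classifier_proj_size` 256, and five labels implies a head roughly like the sketch below. The exact layers live in the custom `Wav2Vec2ForSpeechClassification` class from the base model's training code, so treat this as an approximation:

```python
import torch
import torch.nn as nn

class MeanPoolingClassifierSketch(nn.Module):
    """Approximation of the head implied by config.json: mean pooling over
    the 1024-dim encoder frames, a 256-dim projection, then logits for the
    5 emotion labels. The real implementation may differ in activations
    and dropout placement."""

    def __init__(self, hidden_size=1024, proj_size=256, num_labels=5):
        super().__init__()
        self.dense = nn.Linear(hidden_size, proj_size)
        self.out_proj = nn.Linear(proj_size, num_labels)

    def forward(self, hidden_states):        # (batch, frames, hidden_size)
        pooled = hidden_states.mean(dim=1)   # "pooling_mode": "mean" over time
        return self.out_proj(torch.tanh(self.dense(pooled)))  # (batch, num_labels)
```
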
model.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:d911bd0218782e553ea1cabe9b0bdaf44506550ac800220bf7f18648d7e41c59
size 1266026604
preprocessor_config.json ADDED
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}
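
The extractor operates on 16 kHz mono float audio and returns attention masks. A preprocessing sketch built from the values above; the file name and the use of torchaudio for resampling are illustrative assumptions:

```python
import torchaudio
from transformers import Wav2Vec2FeatureExtractor

# Construct the extractor from the values in preprocessor_config.json
# (equivalently, load it with Wav2Vec2FeatureExtractor.from_pretrained).
extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, sampling_rate=16000, padding_value=0.0,
    do_normalize=True, return_attention_mask=True,
)

waveform, sr = torchaudio.load("clip.wav")   # (channels, samples); assumed file
waveform = waveform.mean(dim=0)              # downmix to mono
if sr != extractor.sampling_rate:
    waveform = torchaudio.functional.resample(waveform, sr, extractor.sampling_rate)

batch = extractor(waveform.numpy(), sampling_rate=extractor.sampling_rate,
                  return_tensors="pt", padding=True)
print(batch.input_values.shape, batch.attention_mask.shape)
```
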
runs/May03_06-24-46_a6df16a6f07f/events.out.tfevents.1714717488.a6df16a6f07f.35.0 ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:b56c0d802cdf41d8000f253885489b412dd8a5ebdf1625cea719488fcf351c3c
size 17270
train_results.json ADDED
{
    "epoch": 14.969696969696969,
    "total_flos": 3.163398064220592e+18,
    "train_loss": 0.13106194541521884,
    "train_runtime": 5199.6798,
    "train_samples": 1979,
    "train_samples_per_second": 5.709,
    "train_steps_per_second": 0.713
}
trainer_state.json ADDED
{
  "best_metric": 0.09009132534265518,
  "best_model_checkpoint": "/kaggle/working/xls-r-amharic/checkpoint-2500",
  "epoch": 14.969696969696969,
  "eval_steps": 500,
  "global_step": 3705,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.40404040404040403, "grad_norm": 3.749224901199341, "learning_rate": 9.730094466936572e-06, "loss": 1.1928, "step": 100},
    {"epoch": 0.8080808080808081, "grad_norm": 5.272401809692383, "learning_rate": 9.46288798920378e-06, "loss": 0.7006, "step": 200},
    {"epoch": 1.2121212121212122, "grad_norm": 2.211305618286133, "learning_rate": 9.192982456140351e-06, "loss": 0.3993, "step": 300},
    {"epoch": 1.6161616161616161, "grad_norm": 0.4073326289653778, "learning_rate": 8.923076923076925e-06, "loss": 0.2935, "step": 400},
    {"epoch": 2.0202020202020203, "grad_norm": 10.367269515991211, "learning_rate": 8.65587044534413e-06, "loss": 0.2847, "step": 500},
    {"epoch": 2.0202020202020203, "eval_accuracy": 0.9212121367454529, "eval_loss": 0.247890442609787, "eval_runtime": 43.8986, "eval_samples_per_second": 11.276, "eval_steps_per_second": 2.825, "step": 500},
    {"epoch": 2.4242424242424243, "grad_norm": 14.821730613708496, "learning_rate": 8.385964912280704e-06, "loss": 0.1788, "step": 600},
    {"epoch": 2.8282828282828283, "grad_norm": 1.658502221107483, "learning_rate": 8.116059379217275e-06, "loss": 0.1541, "step": 700},
    {"epoch": 3.2323232323232323, "grad_norm": 7.924429416656494, "learning_rate": 7.846153846153847e-06, "loss": 0.1683, "step": 800},
    {"epoch": 3.6363636363636362, "grad_norm": 0.14956633746623993, "learning_rate": 7.576248313090419e-06, "loss": 0.1315, "step": 900},
    {"epoch": 4.040404040404041, "grad_norm": 0.08813250064849854, "learning_rate": 7.306342780026991e-06, "loss": 0.1138, "step": 1000},
    {"epoch": 4.040404040404041, "eval_accuracy": 0.9434343576431274, "eval_loss": 0.20633606612682343, "eval_runtime": 43.6554, "eval_samples_per_second": 11.339, "eval_steps_per_second": 2.84, "step": 1000},
    {"epoch": 4.444444444444445, "grad_norm": 2.7388453483581543, "learning_rate": 7.036437246963563e-06, "loss": 0.1113, "step": 1100},
    {"epoch": 4.848484848484849, "grad_norm": 1.112307071685791, "learning_rate": 6.766531713900135e-06, "loss": 0.1174, "step": 1200},
    {"epoch": 5.252525252525253, "grad_norm": 0.9322103261947632, "learning_rate": 6.496626180836708e-06, "loss": 0.1025, "step": 1300},
    {"epoch": 5.656565656565657, "grad_norm": 0.03639671951532364, "learning_rate": 6.22672064777328e-06, "loss": 0.0754, "step": 1400},
    {"epoch": 6.0606060606060606, "grad_norm": 0.024523159489035606, "learning_rate": 5.956815114709852e-06, "loss": 0.0614, "step": 1500},
    {"epoch": 6.0606060606060606, "eval_accuracy": 0.965656578540802, "eval_loss": 0.1415119469165802, "eval_runtime": 43.7636, "eval_samples_per_second": 11.311, "eval_steps_per_second": 2.833, "step": 1500},
    {"epoch": 6.4646464646464645, "grad_norm": 0.04833903908729553, "learning_rate": 5.686909581646424e-06, "loss": 0.0296, "step": 1600},
    {"epoch": 6.8686868686868685, "grad_norm": 0.018174033612012863, "learning_rate": 5.417004048582997e-06, "loss": 0.086, "step": 1700},
    {"epoch": 7.2727272727272725, "grad_norm": 0.2865201532840729, "learning_rate": 5.147098515519568e-06, "loss": 0.0671, "step": 1800},
    {"epoch": 7.6767676767676765, "grad_norm": 0.0348142571747303, "learning_rate": 4.877192982456141e-06, "loss": 0.0512, "step": 1900},
    {"epoch": 8.080808080808081, "grad_norm": 0.2693362832069397, "learning_rate": 4.607287449392713e-06, "loss": 0.0349, "step": 2000},
    {"epoch": 8.080808080808081, "eval_accuracy": 0.973737359046936, "eval_loss": 0.13826610147953033, "eval_runtime": 43.9764, "eval_samples_per_second": 11.256, "eval_steps_per_second": 2.82, "step": 2000},
    {"epoch": 8.484848484848484, "grad_norm": 0.033031389117240906, "learning_rate": 4.337381916329285e-06, "loss": 0.0251, "step": 2100},
    {"epoch": 8.88888888888889, "grad_norm": 0.13385087251663208, "learning_rate": 4.067476383265857e-06, "loss": 0.0367, "step": 2200},
    {"epoch": 9.292929292929292, "grad_norm": 41.249717712402344, "learning_rate": 3.7975708502024296e-06, "loss": 0.0501, "step": 2300},
    {"epoch": 9.696969696969697, "grad_norm": 0.02456289902329445, "learning_rate": 3.527665317139002e-06, "loss": 0.0555, "step": 2400},
    {"epoch": 10.1010101010101, "grad_norm": 0.010708549991250038, "learning_rate": 3.2577597840755737e-06, "loss": 0.0143, "step": 2500},
    {"epoch": 10.1010101010101, "eval_accuracy": 0.9818181991577148, "eval_loss": 0.09009132534265518, "eval_runtime": 44.1262, "eval_samples_per_second": 11.218, "eval_steps_per_second": 2.81, "step": 2500},
    {"epoch": 10.505050505050505, "grad_norm": 0.012150867842137814, "learning_rate": 2.9905533063427807e-06, "loss": 0.0486, "step": 2600},
    {"epoch": 10.909090909090908, "grad_norm": 0.008297057822346687, "learning_rate": 2.7206477732793525e-06, "loss": 0.0349, "step": 2700},
    {"epoch": 11.313131313131313, "grad_norm": 0.013805734924972057, "learning_rate": 2.4507422402159244e-06, "loss": 0.0214, "step": 2800},
    {"epoch": 11.717171717171716, "grad_norm": 0.011365755461156368, "learning_rate": 2.180836707152497e-06, "loss": 0.0229, "step": 2900},
    {"epoch": 12.121212121212121, "grad_norm": 0.02992076426744461, "learning_rate": 1.910931174089069e-06, "loss": 0.0178, "step": 3000},
    {"epoch": 12.121212121212121, "eval_accuracy": 0.9777777791023254, "eval_loss": 0.1187622994184494, "eval_runtime": 43.9086, "eval_samples_per_second": 11.273, "eval_steps_per_second": 2.824, "step": 3000},
    {"epoch": 12.525252525252526, "grad_norm": 0.09338176250457764, "learning_rate": 1.6410256410256412e-06, "loss": 0.0252, "step": 3100},
    {"epoch": 12.929292929292929, "grad_norm": 0.27980300784111023, "learning_rate": 1.3738191632928477e-06, "loss": 0.0387, "step": 3200},
    {"epoch": 13.333333333333334, "grad_norm": 0.007048506755381823, "learning_rate": 1.1039136302294197e-06, "loss": 0.0093, "step": 3300},
    {"epoch": 13.737373737373737, "grad_norm": 0.00685643358156085, "learning_rate": 8.34008097165992e-07, "loss": 0.0222, "step": 3400},
    {"epoch": 14.141414141414142, "grad_norm": 0.14381413161754608, "learning_rate": 5.641025641025642e-07, "loss": 0.0222, "step": 3500},
    {"epoch": 14.141414141414142, "eval_accuracy": 0.9777777791023254, "eval_loss": 0.12370182573795319, "eval_runtime": 44.0309, "eval_samples_per_second": 11.242, "eval_steps_per_second": 2.816, "step": 3500},
    {"epoch": 14.545454545454545, "grad_norm": 0.015089770779013634, "learning_rate": 2.941970310391363e-07, "loss": 0.0304, "step": 3600},
    {"epoch": 14.94949494949495, "grad_norm": 1.67782723903656, "learning_rate": 2.4291497975708507e-08, "loss": 0.0173, "step": 3700},
    {"epoch": 14.969696969696969, "step": 3705, "total_flos": 3.163398064220592e+18, "train_loss": 0.13106194541521884, "train_runtime": 5199.6798, "train_samples_per_second": 5.709, "train_steps_per_second": 0.713}
  ],
  "logging_steps": 100,
  "max_steps": 3705,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {"early_stopping_patience": 3, "early_stopping_threshold": 0.0},
      "attributes": {"early_stopping_patience_counter": 0}
    },
    "TrainerControl": {
      "args": {"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false},
      "attributes": {}
    }
  },
  "total_flos": 3.163398064220592e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
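
The state records early stopping with patience 3 on the monitored metric (best eval loss 0.0901 at checkpoint 2500). A hedged sketch of the corresponding Trainer wiring; the model, datasets, and metric function are not part of this repo's metadata and are assumed to be in scope:

```python
from transformers import EarlyStoppingCallback, Trainer

# Sketch implied by trainer_state.json; requires load_best_model_at_end=True
# and metric_for_best_model in TrainingArguments (see the sketch in the README).
trainer = Trainer(
    model=model,                  # assumed: the fine-tuned classification model
    args=training_args,           # assumed: the TrainingArguments sketched above
    train_dataset=train_dataset,  # assumed: 1979 labeled Amharic speech samples
    eval_dataset=eval_dataset,    # assumed: the 495-sample evaluation split
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3,
                                     early_stopping_threshold=0.0)],
)
trainer.train()
```
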
training_args.bin ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:b13ee671522b5df37fb1d01d3fa7843a6541ac54f9e525ca78713c402dc00ab7
size 5112