clean up the repo
- 5gram.arpa +0 -0
- 5gram.bin +0 -3
- 5gram.txt +0 -0
- README.md.0 +0 -129
- arg.txt +0 -34
- er2 +0 -259
- err +0 -214
- ngram.py +0 -25
- test-vocab.py +0 -22
- wav2vec2-base-turkish +0 -1
5gram.arpa
DELETED
The diff for this file is too large to render. See raw diff.
5gram.bin
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7a76859c96afc4fa223dc7e5cb4d000926ec82f25ffbf560afec88ad39ed8783
size 1831539
5gram.txt
DELETED
The diff for this file is too large to render. See raw diff.
README.md.0
DELETED
@@ -1,129 +0,0 @@
---
language: tr
datasets:
- common_voice
metrics:
- wer
tags:
- audio
- automatic-speech-recognition
- speech
- common_voice
- generated_from_trainer
- tr
- robust-speech-event
license: apache-2.0
model-index:
- name: Wav2Vec2 Base Turkish by Cahya
  results:
  - task:
      name: Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Common Voice tr
      type: common_voice
      args: tr
    metrics:
    - name: Test WER
      type: wer
      value: 13.70
---

# Wav2Vec2-Large-XLSR-Turkish

This is the model for Wav2Vec2-Base-Turkish-Artificial-CV, a fine-tuned
[cahya/wav2vec2-base-turkish-artificial](https://huggingface.co/cahya/wav2vec2-base-turkish-artificial)
model on [Turkish Common Voice dataset](https://huggingface.co/datasets/common_voice).

When using this model, make sure that your speech input is sampled at 16kHz.

## Usage
The model can be used directly (without a language model) as follows:
```python
import torch
import torchaudio
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

test_dataset = load_dataset("common_voice", "tr", split="test[:2%]")

processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-base-turkish-artificial-cv")
model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-base-turkish-artificial-cv")


# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset[:2]["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)

print("Prediction:", processor.batch_decode(predicted_ids))
print("Reference:", test_dataset[:2]["sentence"])
```


## Evaluation

The model can be evaluated as follows on the Turkish test data of Common Voice.

```python
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

test_dataset = load_dataset("common_voice", "tr", split="test")
wer = load_metric("wer")

processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-base-turkish-artificial-cv")
model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-base-turkish-artificial-cv")
model.to("cuda")

chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\‘\”\'\`…\’»«]'

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda")).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)

print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
```

**Test Result**: 13.70 %

## Training

The Common Voice `train`, `validation`, other and invalidated

The script used for training can be found [here](https://github.com/cahya-wirawan/indonesian-speech-recognition)
arg.txt
DELETED
@@ -1,34 +0,0 @@
--dataset_name="common_voice"
--model_name_or_path="cahya/wav2vec2-base-turkish-artificial-cv"
--dataset_config_name="tr"
--output_dir="./output"
--overwrite_output_dir
--num_train_epochs="1"
--per_device_train_batch_size="2"
--per_device_eval_batch_size="2"
--gradient_accumulation_steps="4"
--learning_rate="7.5e-7"
--warmup_steps="2000"
--length_column_name="input_length"
--evaluation_strategy="steps"
--text_column_name="sentence"
--save_steps="500"
--eval_steps="500"
--logging_steps="100"
--layerdrop="0.0"
--activation_dropout="0.1"
--save_total_limit="3"
--freeze_feature_encoder
--feat_proj_dropout="0.0"
--mask_time_prob="0.75"
--mask_time_length="10"
--mask_feature_prob="0.25"
--mask_feature_length="64"
--gradient_checkpointing
--use_auth_token
--fp16=false
--group_by_length
--do_train=true
--do_eval=true
--push_to_hub
--chars_to_ignore , ? . ! \; \: \"\" \% \' \" \' \' \` … \’ » « \‘ '“' '”' � é û
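These flags belong to the fine-tuning run whose logs follow (the `er2`/`err` files below name `run_speech_recognition_ctc.py`). As a minimal sketch, not the actual script, of how such a flag list is typically consumed, transformers' `HfArgumentParser` parses a subset of them into the `TrainingArguments` dump that appears in `err`; the flag subset here is chosen purely for illustration:

```python
# Sketch only: how flags like those in arg.txt are usually parsed by a
# transformers example script. The real run_speech_recognition_ctc.py also
# defines model/data argument dataclasses, which are omitted here.
from transformers import HfArgumentParser, TrainingArguments

parser = HfArgumentParser(TrainingArguments)
(training_args,) = parser.parse_args_into_dataclasses(args=[
    "--output_dir=./output",
    "--learning_rate=7.5e-7",
    "--warmup_steps=2000",
    "--per_device_train_batch_size=2",
    "--gradient_accumulation_steps=4",
    "--num_train_epochs=1",
])
print(training_args.learning_rate)  # 7.5e-07, as echoed in the err log below
```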
er2
DELETED
@@ -1,259 +0,0 @@
loading configuration file https://huggingface.co/cahya/wav2vec2-base-turkish-artificial-cv/resolve/main/config.json from cache at /home/cahya/.cache/huggingface/transformers/47f005d7b541562c0734cfe1b8aaf7f644846084b33a9247f5810d5a16d001a7.1c2175954f7220a41c71683d239699eb295d40ec92ac51faac3b85ad4bef2ad8
/home/cahya/Work/MachineLearning/transformers/src/transformers/configuration_utils.py:353: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.
  warnings.warn(
Model config Wav2Vec2Config {
  "_name_or_path": "cahya/wav2vec2-base-turkish-artificial-cv",
  "activation_dropout": 0.055,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.094,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.04,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.1,
  "gradient_checkpointing": true,
  "hidden_act": "gelu",
  "hidden_dropout": 0.047,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.041,
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_prob": 0.4,
  "model_type": "wav2vec2",
  "num_adapter_layers": 3,
  "num_attention_heads": 12,
  "num_codevector_groups": 2,
  "num_codevectors_per_group": 320,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "num_negatives": 100,
  "output_hidden_size": 768,
  "pad_token_id": 39,
  "proj_codevector_dim": 256,
  "tdnn_dilation": [
    1,
    2,
    3,
    1,
    1
  ],
  "tdnn_dim": [
    512,
    512,
    512,
    512,
    1500
  ],
  "tdnn_kernel": [
    5,
    3,
    3,
    1,
    1
  ],
  "transformers_version": "4.17.0.dev0",
  "use_weighted_layer_sum": false,
  "vocab_size": 40,
  "xvector_output_dim": 512
}


0%| | 0/1 [00:00<?, ?ba/s]

0%| | 0/1 [00:00<?, ?ba/s]
Didn't find file ./output/tokenizer_config.json. We won't load it.
Didn't find file ./output/added_tokens.json. We won't load it.
Didn't find file ./output/special_tokens_map.json. We won't load it.
Didn't find file ./output/tokenizer.json. We won't load it.
loading file ./output/vocab.json
loading file None
loading file None
loading file None
loading file None
file ./output/config.json not found
Adding <s> to the vocabulary
Adding </s> to the vocabulary
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file https://huggingface.co/cahya/wav2vec2-base-turkish-artificial-cv/resolve/main/config.json from cache at /home/cahya/.cache/huggingface/transformers/47f005d7b541562c0734cfe1b8aaf7f644846084b33a9247f5810d5a16d001a7.1c2175954f7220a41c71683d239699eb295d40ec92ac51faac3b85ad4bef2ad8
Model config Wav2Vec2Config {
  "_name_or_path": "cahya/wav2vec2-base-turkish-artificial-cv",
  "activation_dropout": 0.055,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.094,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.04,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.1,
  "gradient_checkpointing": true,
  "hidden_act": "gelu",
  "hidden_dropout": 0.047,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.041,
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_prob": 0.4,
  "model_type": "wav2vec2",
  "num_adapter_layers": 3,
  "num_attention_heads": 12,
  "num_codevector_groups": 2,
  "num_codevectors_per_group": 320,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "num_negatives": 100,
  "output_hidden_size": 768,
  "pad_token_id": 39,
  "proj_codevector_dim": 256,
  "tdnn_dilation": [
    1,
    2,
    3,
    1,
    1
  ],
  "tdnn_dim": [
    512,
    512,
    512,
    512,
    1500
  ],
  "tdnn_kernel": [
    5,
    3,
    3,
    1,
    1
  ],
  "transformers_version": "4.17.0.dev0",
  "use_weighted_layer_sum": false,
  "vocab_size": 40,
  "xvector_output_dim": 512
}

loading feature extractor configuration file https://huggingface.co/cahya/wav2vec2-base-turkish-artificial-cv/resolve/main/preprocessor_config.json from cache at /home/cahya/.cache/huggingface/transformers/34433162acde7e1ca4a265d8ae309442e4ddadff37e6e37d2d37eb7133f65f8f.fcd266b775b7f33ba9b607a0fee7cc615aeb2eb281586f046280492ea380ae23
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

loading weights file https://huggingface.co/cahya/wav2vec2-base-turkish-artificial-cv/resolve/main/pytorch_model.bin from cache at /home/cahya/.cache/huggingface/transformers/3b3f7d0041c2b08b031c8357e39249bdbc06c8bfcd5a9f8891c7f259b07a0b85.356b4eec0d55a5c4d2d480c2dd2ea2cc0c867771bc39b8cdc97b629e4206482c
Traceback (most recent call last):
  File "run_speech_recognition_ctc.py", line 745, in <module>
    main()
  File "run_speech_recognition_ctc.py", line 552, in main
    model = AutoModelForCTC.from_pretrained(
  File "/home/cahya/Work/MachineLearning/transformers/src/transformers/models/auto/auto_factory.py", line 447, in from_pretrained
    return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
  File "/home/cahya/Work/MachineLearning/transformers/src/transformers/modeling_utils.py", line 1528, in from_pretrained
    model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_state_dict_into_model(
  File "/home/cahya/Work/MachineLearning/transformers/src/transformers/modeling_utils.py", line 1682, in _load_state_dict_into_model
    raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
RuntimeError: Error(s) in loading state_dict for Wav2Vec2ForCTC:
    size mismatch for lm_head.weight: copying a param with shape torch.Size([40, 768]) from checkpoint, the shape in current model is torch.Size([41, 768]).
    size mismatch for lm_head.bias: copying a param with shape torch.Size([40]) from checkpoint, the shape in current model is torch.Size([41]).
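The traceback above ends in a vocabulary-size mismatch: the checkpoint's CTC head has 40 outputs, while the model rebuilt around the tokenizer from ./output/vocab.json expects 41 once special tokens are added. A minimal diagnostic sketch, assuming the ./output directory from this log still exists (this is not a file from the repo):

```python
# Sketch: compare the checkpoint's head size with the tokenizer the failed run
# rebuilt in ./output. Assumes ./output/vocab.json from the log above exists.
from transformers import AutoConfig, Wav2Vec2CTCTokenizer

config = AutoConfig.from_pretrained("cahya/wav2vec2-base-turkish-artificial-cv")
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./output")

print("checkpoint vocab_size:", config.vocab_size)  # 40 per the config dump above
print("rebuilt tokenizer size:", len(tokenizer))     # larger once special tokens are added

# One common way to unblock such a run (an assumption, not what was done here)
# is to let from_pretrained re-initialise the mismatched CTC head:
# model = AutoModelForCTC.from_pretrained(
#     "cahya/wav2vec2-base-turkish-artificial-cv",
#     vocab_size=len(tokenizer),
#     ignore_mismatched_sizes=True,
# )
```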
err
DELETED
@@ -1,214 +0,0 @@
training_args.do_train: True
01/28/2022 11:13:09 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False
01/28/2022 11:13:09 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=4,
gradient_checkpointing=True,
greater_is_better=None,
group_by_length=True,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=7.5e-07,
length_column_name=input_length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=./output/runs/Jan28_11-13-09_arjuna,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=100,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler_type=SchedulerType.LINEAR,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
no_cuda=False,
num_train_epochs=1.0,
optim=OptimizerNames.ADAMW_HF,
output_dir=./output,
overwrite_output_dir=True,
past_index=-1,
per_device_eval_batch_size=2,
per_device_train_batch_size=2,
prediction_loss_only=False,
push_to_hub=True,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
remove_unused_columns=True,
report_to=['tensorboard'],
resume_from_checkpoint=None,
run_name=./output,
save_on_each_node=False,
save_steps=500,
save_strategy=IntervalStrategy.STEPS,
save_total_limit=3,
seed=42,
sharded_ddp=[],
skip_memory_metrics=True,
tf32=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_legacy_prediction_loop=False,
warmup_ratio=0.0,
warmup_steps=2000,
weight_decay=0.0,
xpu_backend=None,
)
do_train: True
load train
01/28/2022 11:13:09 - WARNING - datasets.builder - Reusing dataset common_voice (/home/cahya/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd)
01/28/2022 11:13:10 - WARNING - datasets.builder - Reusing dataset common_voice (/home/cahya/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd)
char ignored: [',', '?', '.', '!', ';', ':', '""', '%', "'", '"', "'", "'", '`', '…', '’', '»', '«', '‘', '“', '”', '�', 'é', 'û'] [,?.!;:""%'"''`…’»«‘“”�éû]
01/28/2022 11:13:10 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/cahya/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-a0df3a81748e62dd.arrow
01/28/2022 11:13:10 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/cahya/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-859966f17c7349fb.arrow
config: Wav2Vec2Config {
  "_name_or_path": "cahya/wav2vec2-base-turkish-artificial-cv",
  "activation_dropout": 0.055,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.094,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.04,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.1,
  "gradient_checkpointing": true,
  "hidden_act": "gelu",
  "hidden_dropout": 0.047,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.041,
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_prob": 0.4,
  "model_type": "wav2vec2",
  "num_adapter_layers": 3,
  "num_attention_heads": 12,
  "num_codevector_groups": 2,
  "num_codevectors_per_group": 320,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "num_negatives": 100,
  "output_hidden_size": 768,
  "pad_token_id": 39,
  "proj_codevector_dim": 256,
  "tdnn_dilation": [
    1,
    2,
    3,
    1,
    1
  ],
  "tdnn_dim": [
    512,
    512,
    512,
    512,
    1500
  ],
  "tdnn_kernel": [
    5,
    3,
    3,
    1,
    1
  ],
  "transformers_version": "4.17.0.dev0",
  "use_weighted_layer_sum": false,
  "vocab_size": 40,
  "xvector_output_dim": 512
}

dataset: DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'target_text'],
        num_rows: 3478
    })
    eval: Dataset({
        features: ['client_id', 'path', 'audio', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'target_text'],
        num_rows: 1647
    })
})
vocab: {'-': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, 'i': 10, 'j': 11, 'k': 12, 'l': 13, 'm': 14, 'n': 15, 'o': 16, 'p': 17, 'q': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27, 'â': 28, 'ç': 29, 'ë': 30, 'î': 31, 'ö': 32, 'ü': 33, 'ğ': 34, 'ı': 35, 'ş': 36, '̇': 37, '|': 0, '[UNK]': 38, '[PAD]': 39}
ngram.py
DELETED
@@ -1,25 +0,0 @@
from transformers import AutoProcessor
from transformers import Wav2Vec2ProcessorWithLM
from huggingface_hub import Repository
from pyctcdecode import build_ctcdecoder

model_name = "cahya/wav2vec2-base-turkish-artificial-cv"
processor = AutoProcessor.from_pretrained(model_name)

vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}

decoder = build_ctcdecoder(
    labels=list(sorted_vocab_dict.keys()),
    kenlm_model_path="5gram.arpa",
)

processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder
)

#repo = Repository(local_dir="wav2vec2-base-turkish", clone_from=model_name)
processor_with_lm.save_pretrained("wav2vec2-base-turkish")
#repo.push_to_hub(commit_message="Upload lm-boosted decoder")
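ngram.py writes an LM-boosted processor into a local wav2vec2-base-turkish directory, presumably the checkout removed at the bottom of this commit. A minimal sketch of how that saved processor would be exercised, with a silent dummy waveform standing in for real 16 kHz audio (the dummy array and the use of the artificial-cv checkpoint for logits are assumptions, not repo files):

```python
# Sketch only: load the processor saved by ngram.py and decode with the LM.
# The dummy waveform is a placeholder; real audio must be 16 kHz mono floats.
import numpy as np
import torch
from transformers import AutoModelForCTC, Wav2Vec2ProcessorWithLM

processor = Wav2Vec2ProcessorWithLM.from_pretrained("wav2vec2-base-turkish")
model = AutoModelForCTC.from_pretrained("cahya/wav2vec2-base-turkish-artificial-cv")

speech = np.zeros(16_000, dtype=np.float32)  # placeholder: 1 second of silence at 16 kHz
inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    logits = model(inputs.input_values).logits

# With an LM-boosted processor, batch_decode runs pyctcdecode on the raw logits
print(processor.batch_decode(logits.numpy()).text)
```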
test-vocab.py
DELETED
@@ -1,22 +0,0 @@
import torch
from datasets import load_dataset
from transformers import AutoModelForCTC, AutoProcessor
import torchaudio.functional as F

model_id = "cahya/wav2vec2-base-turkish"

sample_iter = iter(load_dataset("common_voice", "tr", split="test", streaming=True))

sample = next(sample_iter)
resampled_audio = F.resample(torch.tensor(sample["audio"]["array"]), 48_000, 16_000).numpy()

model = AutoModelForCTC.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

input_values = processor(resampled_audio, return_tensors="pt").input_values

with torch.no_grad():
    logits = model(input_values).logits

transcription = processor.batch_decode(logits.numpy()).text
print(transcription)
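test-vocab.py hard-codes a 48 kHz source rate for the streamed sample. As a small, hedged variant (assuming the streamed sample exposes its rate via the datasets Audio feature, as it does when the "audio" column is decoded), the rate can be read from the sample itself:

```python
# Sketch: same streaming setup as test-vocab.py, but the source sampling rate
# is taken from the sample instead of being assumed to be 48_000.
import torch
import torchaudio.functional as F
from datasets import load_dataset

sample = next(iter(load_dataset("common_voice", "tr", split="test", streaming=True)))
orig_sr = sample["audio"]["sampling_rate"]
resampled_audio = F.resample(torch.tensor(sample["audio"]["array"]), orig_sr, 16_000).numpy()
```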
wav2vec2-base-turkish
DELETED
@@ -1 +0,0 @@
Subproject commit 84a5ba89d7a3f162d409b42e1b515d9bf2a8d021