cahya committed on
Commit
2f2d456
1 Parent(s): c63c37f

clean up the repo

Files changed (10)
  1. 5gram.arpa +0 -0
  2. 5gram.bin +0 -3
  3. 5gram.txt +0 -0
  4. README.md.0 +0 -129
  5. arg.txt +0 -34
  6. er2 +0 -259
  7. err +0 -214
  8. ngram.py +0 -25
  9. test-vocab.py +0 -22
  10. wav2vec2-base-turkish +0 -1
5gram.arpa DELETED
The diff for this file is too large to render. See raw diff
 
5gram.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7a76859c96afc4fa223dc7e5cb4d000926ec82f25ffbf560afec88ad39ed8783
- size 1831539
 
5gram.txt DELETED
The diff for this file is too large to render. See raw diff
 
README.md.0 DELETED
@@ -1,129 +0,0 @@
- ---
- language: tr
- datasets:
- - common_voice
- metrics:
- - wer
- tags:
- - audio
- - automatic-speech-recognition
- - speech
- - common_voice
- - generated_from_trainer
- - tr
- - robust-speech-event
- license: apache-2.0
- model-index:
- - name: Wav2Vec2 Base Turkish by Cahya
-   results:
-   - task:
-       name: Speech Recognition
-       type: automatic-speech-recognition
-     dataset:
-       name: Common Voice tr
-       type: common_voice
-       args: tr
-     metrics:
-     - name: Test WER
-       type: wer
-       value: 13.70
- ---
-
- # Wav2Vec2-Base-Turkish-Artificial-CV
-
- This is Wav2Vec2-Base-Turkish-Artificial-CV, a version of
- [cahya/wav2vec2-base-turkish-artificial](https://huggingface.co/cahya/wav2vec2-base-turkish-artificial)
- fine-tuned on the [Turkish Common Voice dataset](https://huggingface.co/datasets/common_voice).
-
- When using this model, make sure that your speech input is sampled at 16 kHz.
-
- ## Usage
- The model can be used directly (without a language model) as follows:
- ```python
- import torch
- import torchaudio
- from datasets import load_dataset
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-
- test_dataset = load_dataset("common_voice", "tr", split="test[:2%]")
-
- processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-base-turkish-artificial-cv")
- model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-base-turkish-artificial-cv")
-
-
- # Preprocessing the datasets.
- # We need to read the audio files as arrays
- def speech_file_to_array_fn(batch):
-     speech_array, sampling_rate = torchaudio.load(batch["path"])
-     resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
-     batch["speech"] = resampler(speech_array).squeeze().numpy()
-     return batch
-
- test_dataset = test_dataset.map(speech_file_to_array_fn)
- inputs = processor(test_dataset[:2]["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
-
- with torch.no_grad():
-     logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
-
- predicted_ids = torch.argmax(logits, dim=-1)
-
- print("Prediction:", processor.batch_decode(predicted_ids))
- print("Reference:", test_dataset[:2]["sentence"])
- ```
-
-
- ## Evaluation
-
- The model can be evaluated as follows on the Turkish test data of Common Voice.
-
- ```python
- import torch
- import torchaudio
- from datasets import load_dataset, load_metric
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
- import re
-
- test_dataset = load_dataset("common_voice", "tr", split="test")
- wer = load_metric("wer")
-
- processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-base-turkish-artificial-cv")
- model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-base-turkish-artificial-cv")
- model.to("cuda")
-
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\‘\”\'\`…\’»«]'
-
- # Preprocessing the datasets.
- # We need to read the audio files as arrays
- def speech_file_to_array_fn(batch):
-     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
-     speech_array, sampling_rate = torchaudio.load(batch["path"])
-     resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
-     batch["speech"] = resampler(speech_array).squeeze().numpy()
-     return batch
-
- test_dataset = test_dataset.map(speech_file_to_array_fn)
-
- # Run inference over the test set and collect the predicted strings.
- def evaluate(batch):
-     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
-
-     with torch.no_grad():
-         logits = model(inputs.input_values.to("cuda")).logits
-
-     pred_ids = torch.argmax(logits, dim=-1)
-     batch["pred_strings"] = processor.batch_decode(pred_ids)
-     return batch
-
- result = test_dataset.map(evaluate, batched=True, batch_size=8)
-
- print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
- ```
-
- **Test Result**: 13.70 %
-
- ## Training
-
- The Common Voice `train`, `validation`, `other`, and `invalidated` splits were used for training.
-
- The script used for training can be found [here](https://github.com/cahya-wirawan/indonesian-speech-recognition).
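
The training section of the deleted README names the Common Voice splits that were combined. As a minimal, hedged sketch (the real training script linked above applies additional filtering and preprocessing that is not reproduced here), those splits could be loaded and concatenated with `datasets` like this:

```python
# Sketch: load and concatenate the Common Voice splits named in the README's
# training section. The actual training script performs extra cleaning not shown here.
from datasets import load_dataset, concatenate_datasets

splits = ["train", "validation", "other", "invalidated"]
parts = [load_dataset("common_voice", "tr", split=s) for s in splits]
train_dataset = concatenate_datasets(parts)
print(train_dataset)
```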
 
arg.txt DELETED
@@ -1,34 +0,0 @@
- --dataset_name="common_voice"
- --model_name_or_path="cahya/wav2vec2-base-turkish-artificial-cv"
- --dataset_config_name="tr"
- --output_dir="./output"
- --overwrite_output_dir
- --num_train_epochs="1"
- --per_device_train_batch_size="2"
- --per_device_eval_batch_size="2"
- --gradient_accumulation_steps="4"
- --learning_rate="7.5e-7"
- --warmup_steps="2000"
- --length_column_name="input_length"
- --evaluation_strategy="steps"
- --text_column_name="sentence"
- --save_steps="500"
- --eval_steps="500"
- --logging_steps="100"
- --layerdrop="0.0"
- --activation_dropout="0.1"
- --save_total_limit="3"
- --freeze_feature_encoder
- --feat_proj_dropout="0.0"
- --mask_time_prob="0.75"
- --mask_time_length="10"
- --mask_feature_prob="0.25"
- --mask_feature_length="64"
- --gradient_checkpointing
- --use_auth_token
- --fp16=false
- --group_by_length
- --do_train=true
- --do_eval=true
- --push_to_hub
- --chars_to_ignore , ? . ! \; \: \"\" \% \' \" \' \' \` … \’ » « \‘ '“' '”' � é û
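
The deleted arg.txt held the command-line flags for the fine-tuning run. As a hedged sketch of how the trainer-level flags map onto `transformers.TrainingArguments` when `run_speech_recognition_ctc.py` parses them (the model- and data-specific flags such as `--layerdrop`, `--mask_time_prob`, and `--chars_to_ignore` go to the script's own argument dataclasses and are not shown):

```python
# Sketch only: the values are copied from arg.txt above; the real script builds this
# object via HfArgumentParser rather than constructing it by hand.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=7.5e-7,
    warmup_steps=2000,
    length_column_name="input_length",
    evaluation_strategy="steps",
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    save_total_limit=3,
    gradient_checkpointing=True,
    fp16=False,
    group_by_length=True,
    do_train=True,
    do_eval=True,
    push_to_hub=True,
)
```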
 
er2 DELETED
@@ -1,259 +0,0 @@
- loading configuration file https://huggingface.co/cahya/wav2vec2-base-turkish-artificial-cv/resolve/main/config.json from cache at /home/cahya/.cache/huggingface/transformers/47f005d7b541562c0734cfe1b8aaf7f644846084b33a9247f5810d5a16d001a7.1c2175954f7220a41c71683d239699eb295d40ec92ac51faac3b85ad4bef2ad8
- /home/cahya/Work/MachineLearning/transformers/src/transformers/configuration_utils.py:353: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.
- warnings.warn(
- Model config Wav2Vec2Config {
- "_name_or_path": "cahya/wav2vec2-base-turkish-artificial-cv",
- "activation_dropout": 0.055,
- "adapter_kernel_size": 3,
- "adapter_stride": 2,
- "add_adapter": false,
- "apply_spec_augment": true,
- "architectures": [
- "Wav2Vec2ForCTC"
- ],
- "attention_dropout": 0.094,
- "bos_token_id": 1,
- "classifier_proj_size": 256,
- "codevector_dim": 256,
- "contrastive_logits_temperature": 0.1,
- "conv_bias": false,
- "conv_dim": [
- 512,
- 512,
- 512,
- 512,
- 512,
- 512,
- 512
- ],
- "conv_kernel": [
- 10,
- 3,
- 3,
- 3,
- 3,
- 2,
- 2
- ],
- "conv_stride": [
- 5,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2
- ],
- "ctc_loss_reduction": "mean",
- "ctc_zero_infinity": true,
- "diversity_loss_weight": 0.1,
- "do_stable_layer_norm": false,
- "eos_token_id": 2,
- "feat_extract_activation": "gelu",
- "feat_extract_norm": "group",
- "feat_proj_dropout": 0.04,
- "feat_quantizer_dropout": 0.0,
- "final_dropout": 0.1,
- "gradient_checkpointing": true,
- "hidden_act": "gelu",
- "hidden_dropout": 0.047,
- "hidden_size": 768,
- "initializer_range": 0.02,
- "intermediate_size": 3072,
- "layer_norm_eps": 1e-05,
- "layerdrop": 0.041,
- "mask_feature_length": 10,
- "mask_feature_min_masks": 0,
- "mask_feature_prob": 0.0,
- "mask_time_length": 10,
- "mask_time_min_masks": 2,
- "mask_time_prob": 0.4,
- "model_type": "wav2vec2",
- "num_adapter_layers": 3,
- "num_attention_heads": 12,
- "num_codevector_groups": 2,
- "num_codevectors_per_group": 320,
- "num_conv_pos_embedding_groups": 16,
- "num_conv_pos_embeddings": 128,
- "num_feat_extract_layers": 7,
- "num_hidden_layers": 12,
- "num_negatives": 100,
- "output_hidden_size": 768,
- "pad_token_id": 39,
- "proj_codevector_dim": 256,
- "tdnn_dilation": [
- 1,
- 2,
- 3,
- 1,
- 1
- ],
- "tdnn_dim": [
- 512,
- 512,
- 512,
- 512,
- 1500
- ],
- "tdnn_kernel": [
- 5,
- 3,
- 3,
- 1,
- 1
- ],
- "transformers_version": "4.17.0.dev0",
- "use_weighted_layer_sum": false,
- "vocab_size": 40,
- "xvector_output_dim": 512
- }
-
-
  0%| | 0/1 [00:00<?, ?ba/s]
-
  0%| | 0/1 [00:00<?, ?ba/s]
- Didn't find file ./output/tokenizer_config.json. We won't load it.
- Didn't find file ./output/added_tokens.json. We won't load it.
- Didn't find file ./output/special_tokens_map.json. We won't load it.
- Didn't find file ./output/tokenizer.json. We won't load it.
- loading file ./output/vocab.json
- loading file None
- loading file None
- loading file None
- loading file None
- file ./output/config.json not found
- Adding <s> to the vocabulary
- Adding </s> to the vocabulary
- Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
- loading configuration file https://huggingface.co/cahya/wav2vec2-base-turkish-artificial-cv/resolve/main/config.json from cache at /home/cahya/.cache/huggingface/transformers/47f005d7b541562c0734cfe1b8aaf7f644846084b33a9247f5810d5a16d001a7.1c2175954f7220a41c71683d239699eb295d40ec92ac51faac3b85ad4bef2ad8
- Model config Wav2Vec2Config {
- "_name_or_path": "cahya/wav2vec2-base-turkish-artificial-cv",
- "activation_dropout": 0.055,
- "adapter_kernel_size": 3,
- "adapter_stride": 2,
- "add_adapter": false,
- "apply_spec_augment": true,
- "architectures": [
- "Wav2Vec2ForCTC"
- ],
- "attention_dropout": 0.094,
- "bos_token_id": 1,
- "classifier_proj_size": 256,
- "codevector_dim": 256,
- "contrastive_logits_temperature": 0.1,
- "conv_bias": false,
- "conv_dim": [
- 512,
- 512,
- 512,
- 512,
- 512,
- 512,
- 512
- ],
- "conv_kernel": [
- 10,
- 3,
- 3,
- 3,
- 3,
- 2,
- 2
- ],
- "conv_stride": [
- 5,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2
- ],
- "ctc_loss_reduction": "mean",
- "ctc_zero_infinity": true,
- "diversity_loss_weight": 0.1,
- "do_stable_layer_norm": false,
- "eos_token_id": 2,
- "feat_extract_activation": "gelu",
- "feat_extract_norm": "group",
- "feat_proj_dropout": 0.04,
- "feat_quantizer_dropout": 0.0,
- "final_dropout": 0.1,
- "gradient_checkpointing": true,
- "hidden_act": "gelu",
- "hidden_dropout": 0.047,
- "hidden_size": 768,
- "initializer_range": 0.02,
- "intermediate_size": 3072,
- "layer_norm_eps": 1e-05,
- "layerdrop": 0.041,
- "mask_feature_length": 10,
- "mask_feature_min_masks": 0,
- "mask_feature_prob": 0.0,
- "mask_time_length": 10,
- "mask_time_min_masks": 2,
- "mask_time_prob": 0.4,
- "model_type": "wav2vec2",
- "num_adapter_layers": 3,
- "num_attention_heads": 12,
- "num_codevector_groups": 2,
- "num_codevectors_per_group": 320,
- "num_conv_pos_embedding_groups": 16,
- "num_conv_pos_embeddings": 128,
- "num_feat_extract_layers": 7,
- "num_hidden_layers": 12,
- "num_negatives": 100,
- "output_hidden_size": 768,
- "pad_token_id": 39,
- "proj_codevector_dim": 256,
- "tdnn_dilation": [
- 1,
- 2,
- 3,
- 1,
- 1
- ],
- "tdnn_dim": [
- 512,
- 512,
- 512,
- 512,
- 1500
- ],
- "tdnn_kernel": [
- 5,
- 3,
- 3,
- 1,
- 1
- ],
- "transformers_version": "4.17.0.dev0",
- "use_weighted_layer_sum": false,
- "vocab_size": 40,
- "xvector_output_dim": 512
- }
-
- loading feature extractor configuration file https://huggingface.co/cahya/wav2vec2-base-turkish-artificial-cv/resolve/main/preprocessor_config.json from cache at /home/cahya/.cache/huggingface/transformers/34433162acde7e1ca4a265d8ae309442e4ddadff37e6e37d2d37eb7133f65f8f.fcd266b775b7f33ba9b607a0fee7cc615aeb2eb281586f046280492ea380ae23
- Feature extractor Wav2Vec2FeatureExtractor {
- "do_normalize": true,
- "feature_extractor_type": "Wav2Vec2FeatureExtractor",
- "feature_size": 1,
- "padding_side": "right",
- "padding_value": 0.0,
- "return_attention_mask": true,
- "sampling_rate": 16000
- }
-
- loading weights file https://huggingface.co/cahya/wav2vec2-base-turkish-artificial-cv/resolve/main/pytorch_model.bin from cache at /home/cahya/.cache/huggingface/transformers/3b3f7d0041c2b08b031c8357e39249bdbc06c8bfcd5a9f8891c7f259b07a0b85.356b4eec0d55a5c4d2d480c2dd2ea2cc0c867771bc39b8cdc97b629e4206482c
- Traceback (most recent call last):
- File "run_speech_recognition_ctc.py", line 745, in <module>
- main()
- File "run_speech_recognition_ctc.py", line 552, in main
- model = AutoModelForCTC.from_pretrained(
- File "/home/cahya/Work/MachineLearning/transformers/src/transformers/models/auto/auto_factory.py", line 447, in from_pretrained
- return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
- File "/home/cahya/Work/MachineLearning/transformers/src/transformers/modeling_utils.py", line 1528, in from_pretrained
- model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_state_dict_into_model(
- File "/home/cahya/Work/MachineLearning/transformers/src/transformers/modeling_utils.py", line 1682, in _load_state_dict_into_model
- raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
- RuntimeError: Error(s) in loading state_dict for Wav2Vec2ForCTC:
- size mismatch for lm_head.weight: copying a param with shape torch.Size([40, 768]) from checkpoint, the shape in current model is torch.Size([41, 768]).
- size mismatch for lm_head.bias: copying a param with shape torch.Size([40]) from checkpoint, the shape in current model is torch.Size([41]).
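
The traceback in this deleted log shows the checkpoint's CTC head expecting 40 classes while the tokenizer rebuilt from `./output/vocab.json` ended up one token larger. A hedged sketch of a pre-flight check that would surface this mismatch before loading (the local `./output/vocab.json` path is taken from the log and may differ on other setups):

```python
# Sketch: compare the checkpoint's vocab_size with the rebuilt tokenizer's size
# to catch the 40-vs-41 mismatch reported above before from_pretrained() fails.
from transformers import AutoConfig, Wav2Vec2CTCTokenizer

config = AutoConfig.from_pretrained("cahya/wav2vec2-base-turkish-artificial-cv")
tokenizer = Wav2Vec2CTCTokenizer("./output/vocab.json")  # vocab built by the training script

print("checkpoint vocab_size:", config.vocab_size)  # 40 in the log above
print("tokenizer vocab size :", len(tokenizer))      # 41 after extra tokens are added

if config.vocab_size != len(tokenizer):
    print("Mismatch: rebuild the vocab to match the checkpoint, or resize the lm_head, "
          "before resuming fine-tuning.")
```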
 
err DELETED
@@ -1,214 +0,0 @@
- training_args.do_train: True
- 01/28/2022 11:13:09 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False
- 01/28/2022 11:13:09 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
- _n_gpu=1,
- adafactor=False,
- adam_beta1=0.9,
- adam_beta2=0.999,
- adam_epsilon=1e-08,
- bf16=False,
- bf16_full_eval=False,
- dataloader_drop_last=False,
- dataloader_num_workers=0,
- dataloader_pin_memory=True,
- ddp_bucket_cap_mb=None,
- ddp_find_unused_parameters=None,
- debug=[],
- deepspeed=None,
- disable_tqdm=False,
- do_eval=True,
- do_predict=False,
- do_train=True,
- eval_accumulation_steps=None,
- eval_steps=500,
- evaluation_strategy=IntervalStrategy.STEPS,
- fp16=False,
- fp16_backend=auto,
- fp16_full_eval=False,
- fp16_opt_level=O1,
- gradient_accumulation_steps=4,
- gradient_checkpointing=True,
- greater_is_better=None,
- group_by_length=True,
- half_precision_backend=auto,
- hub_model_id=None,
- hub_strategy=HubStrategy.EVERY_SAVE,
- hub_token=<HUB_TOKEN>,
- ignore_data_skip=False,
- label_names=None,
- label_smoothing_factor=0.0,
- learning_rate=7.5e-07,
- length_column_name=input_length,
- load_best_model_at_end=False,
- local_rank=-1,
- log_level=-1,
- log_level_replica=-1,
- log_on_each_node=True,
- logging_dir=./output/runs/Jan28_11-13-09_arjuna,
- logging_first_step=False,
- logging_nan_inf_filter=True,
- logging_steps=100,
- logging_strategy=IntervalStrategy.STEPS,
- lr_scheduler_type=SchedulerType.LINEAR,
- max_grad_norm=1.0,
- max_steps=-1,
- metric_for_best_model=None,
- mp_parameters=,
- no_cuda=False,
- num_train_epochs=1.0,
- optim=OptimizerNames.ADAMW_HF,
- output_dir=./output,
- overwrite_output_dir=True,
- past_index=-1,
- per_device_eval_batch_size=2,
- per_device_train_batch_size=2,
- prediction_loss_only=False,
- push_to_hub=True,
- push_to_hub_model_id=None,
- push_to_hub_organization=None,
- push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
- remove_unused_columns=True,
- report_to=['tensorboard'],
- resume_from_checkpoint=None,
- run_name=./output,
- save_on_each_node=False,
- save_steps=500,
- save_strategy=IntervalStrategy.STEPS,
- save_total_limit=3,
- seed=42,
- sharded_ddp=[],
- skip_memory_metrics=True,
- tf32=None,
- tpu_metrics_debug=False,
- tpu_num_cores=None,
- use_legacy_prediction_loop=False,
- warmup_ratio=0.0,
- warmup_steps=2000,
- weight_decay=0.0,
- xpu_backend=None,
- )
- do_train: True
- load train
- 01/28/2022 11:13:09 - WARNING - datasets.builder - Reusing dataset common_voice (/home/cahya/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd)
- 01/28/2022 11:13:10 - WARNING - datasets.builder - Reusing dataset common_voice (/home/cahya/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd)
- char ignored: [',', '?', '.', '!', ';', ':', '""', '%', "'", '"', "'", "'", '`', '…', '’', '»', '«', '‘', '“', '”', '�', 'é', 'û'] [,?.!;:""%'"''`…’»«‘“”�éû]
- 01/28/2022 11:13:10 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/cahya/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-a0df3a81748e62dd.arrow
- 01/28/2022 11:13:10 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/cahya/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd/cache-859966f17c7349fb.arrow
- config: Wav2Vec2Config {
- "_name_or_path": "cahya/wav2vec2-base-turkish-artificial-cv",
- "activation_dropout": 0.055,
- "adapter_kernel_size": 3,
- "adapter_stride": 2,
- "add_adapter": false,
- "apply_spec_augment": true,
- "architectures": [
- "Wav2Vec2ForCTC"
- ],
- "attention_dropout": 0.094,
- "bos_token_id": 1,
- "classifier_proj_size": 256,
- "codevector_dim": 256,
- "contrastive_logits_temperature": 0.1,
- "conv_bias": false,
- "conv_dim": [
- 512,
- 512,
- 512,
- 512,
- 512,
- 512,
- 512
- ],
- "conv_kernel": [
- 10,
- 3,
- 3,
- 3,
- 3,
- 2,
- 2
- ],
- "conv_stride": [
- 5,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2
- ],
- "ctc_loss_reduction": "mean",
- "ctc_zero_infinity": true,
- "diversity_loss_weight": 0.1,
- "do_stable_layer_norm": false,
- "eos_token_id": 2,
- "feat_extract_activation": "gelu",
- "feat_extract_norm": "group",
- "feat_proj_dropout": 0.04,
- "feat_quantizer_dropout": 0.0,
- "final_dropout": 0.1,
- "gradient_checkpointing": true,
- "hidden_act": "gelu",
- "hidden_dropout": 0.047,
- "hidden_size": 768,
- "initializer_range": 0.02,
- "intermediate_size": 3072,
- "layer_norm_eps": 1e-05,
- "layerdrop": 0.041,
- "mask_feature_length": 10,
- "mask_feature_min_masks": 0,
- "mask_feature_prob": 0.0,
- "mask_time_length": 10,
- "mask_time_min_masks": 2,
- "mask_time_prob": 0.4,
- "model_type": "wav2vec2",
- "num_adapter_layers": 3,
- "num_attention_heads": 12,
- "num_codevector_groups": 2,
- "num_codevectors_per_group": 320,
- "num_conv_pos_embedding_groups": 16,
- "num_conv_pos_embeddings": 128,
- "num_feat_extract_layers": 7,
- "num_hidden_layers": 12,
- "num_negatives": 100,
- "output_hidden_size": 768,
- "pad_token_id": 39,
- "proj_codevector_dim": 256,
- "tdnn_dilation": [
- 1,
- 2,
- 3,
- 1,
- 1
- ],
- "tdnn_dim": [
- 512,
- 512,
- 512,
- 512,
- 1500
- ],
- "tdnn_kernel": [
- 5,
- 3,
- 3,
- 1,
- 1
- ],
- "transformers_version": "4.17.0.dev0",
- "use_weighted_layer_sum": false,
- "vocab_size": 40,
- "xvector_output_dim": 512
- }
-
- dataset: DatasetDict({
- train: Dataset({
- features: ['client_id', 'path', 'audio', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'target_text'],
- num_rows: 3478
- })
- eval: Dataset({
- features: ['client_id', 'path', 'audio', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'target_text'],
- num_rows: 1647
- })
- })
- vocab: {'-': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, 'i': 10, 'j': 11, 'k': 12, 'l': 13, 'm': 14, 'n': 15, 'o': 16, 'p': 17, 'q': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27, 'â': 28, 'ç': 29, 'ë': 30, 'î': 31, 'ö': 32, 'ü': 33, 'ğ': 34, 'ı': 35, 'ş': 36, '̇': 37, '|': 0, '[UNK]': 38, '[PAD]': 39}
 
ngram.py DELETED
@@ -1,25 +0,0 @@
- from transformers import AutoProcessor
- from transformers import Wav2Vec2ProcessorWithLM
- from huggingface_hub import Repository
- from pyctcdecode import build_ctcdecoder
-
- model_name = "cahya/wav2vec2-base-turkish-artificial-cv"
- processor = AutoProcessor.from_pretrained(model_name)
-
- vocab_dict = processor.tokenizer.get_vocab()
- sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
-
- decoder = build_ctcdecoder(
-     labels=list(sorted_vocab_dict.keys()),
-     kenlm_model_path="5gram.arpa",
- )
-
- processor_with_lm = Wav2Vec2ProcessorWithLM(
-     feature_extractor=processor.feature_extractor,
-     tokenizer=processor.tokenizer,
-     decoder=decoder
- )
-
- #repo = Repository(local_dir="wav2vec2-base-turkish", clone_from=model_name)
- processor_with_lm.save_pretrained("wav2vec2-base-turkish")
- #repo.push_to_hub(commit_message="Upload lm-boosted decoder")
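
The deleted ngram.py wires the (also deleted) 5gram.arpa KenLM model into a `Wav2Vec2ProcessorWithLM` and saves it to a local `wav2vec2-base-turkish` directory. A hedged sketch of how that saved, LM-boosted processor could then be used for inference; directory and model names follow the script above and would need adjusting if they were renamed before this cleanup:

```python
# Sketch: inference with the LM-boosted processor saved by ngram.py.
import torch
from transformers import AutoModelForCTC, Wav2Vec2ProcessorWithLM

model = AutoModelForCTC.from_pretrained("cahya/wav2vec2-base-turkish-artificial-cv")
processor = Wav2Vec2ProcessorWithLM.from_pretrained("wav2vec2-base-turkish")

def transcribe(speech):
    # `speech` is assumed to be a 1-D float array sampled at 16 kHz.
    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    # batch_decode on Wav2Vec2ProcessorWithLM runs pyctcdecode beam search with the 5-gram LM.
    return processor.batch_decode(logits.numpy()).text
```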
 
test-vocab.py DELETED
@@ -1,22 +0,0 @@
- import torch
- from datasets import load_dataset
- from transformers import AutoModelForCTC, AutoProcessor
- import torchaudio.functional as F
-
- model_id = "cahya/wav2vec2-base-turkish"
-
- sample_iter = iter(load_dataset("common_voice", "tr", split="test", streaming=True))
-
- sample = next(sample_iter)
- resampled_audio = F.resample(torch.tensor(sample["audio"]["array"]), 48_000, 16_000).numpy()
-
- model = AutoModelForCTC.from_pretrained(model_id)
- processor = AutoProcessor.from_pretrained(model_id)
-
- input_values = processor(resampled_audio, return_tensors="pt").input_values
-
- with torch.no_grad():
-     logits = model(input_values).logits
-
- transcription = processor.batch_decode(logits.numpy()).text
- print(transcription)
 
wav2vec2-base-turkish DELETED
@@ -1 +0,0 @@
- Subproject commit 84a5ba89d7a3f162d409b42e1b515d9bf2a8d021