dragonSwing
committed on
Commit
·
b8e25eb
1
Parent(s):
1089497
Upload files and model
Browse files- .gitattributes +1 -0
- README.md +125 -0
- config.json +77 -0
- custom.py +76 -0
- example.wav +0 -0
- hyperparams.yaml +72 -0
- model.ckpt +3 -0
- preprocessor_config.json +9 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
- vocab.json +1 -0
.gitattributes
CHANGED
@@ -16,6 +16,7 @@
|
|
16 |
*.pb filter=lfs diff=lfs merge=lfs -text
|
17 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
18 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
|
|
19 |
*.rar filter=lfs diff=lfs merge=lfs -text
|
20 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
21 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
|
|
16 |
*.pb filter=lfs diff=lfs merge=lfs -text
|
17 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
18 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
20 |
*.rar filter=lfs diff=lfs merge=lfs -text
|
21 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
22 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language: vi
|
3 |
+
datasets:
|
4 |
+
- vivos
|
5 |
+
- common_voice
|
6 |
+
metrics:
|
7 |
+
- wer
|
8 |
+
tags:
|
9 |
+
- audio
|
10 |
+
- automatic-speech-recognition
|
11 |
+
- speech
|
12 |
+
license: cc-by-nc-4.0
|
13 |
+
widget:
|
14 |
+
- example_title: VLSP ASR 2020 test T1
|
15 |
+
src: https://huggingface.co/nguyenvulebinh/wav2vec2-base-vietnamese-250h/raw/main/audio-test/t1_0001-00010.wav
|
16 |
+
- example_title: VLSP ASR 2020 test T1
|
17 |
+
src: https://huggingface.co/nguyenvulebinh/wav2vec2-base-vietnamese-250h/raw/main/audio-test/t1_utt000000042.wav
|
18 |
+
- example_title: VLSP ASR 2020 test T2
|
19 |
+
src: https://huggingface.co/nguyenvulebinh/wav2vec2-base-vietnamese-250h/raw/main/audio-test/t2_0000006682.wav
|
20 |
+
model-index:
|
21 |
+
- name: Wav2vec2 Base Vietnamese 270h
|
22 |
+
results:
|
23 |
+
- task:
|
24 |
+
name: Speech Recognition
|
25 |
+
type: automatic-speech-recognition
|
26 |
+
dataset:
|
27 |
+
name: Common Voice vi
|
28 |
+
type: common_voice
|
29 |
+
args: vi
|
30 |
+
metrics:
|
31 |
+
- name: Test WER
|
32 |
+
type: wer
|
33 |
+
value: 9.66
|
34 |
+
- task:
|
35 |
+
name: Speech Recognition
|
36 |
+
type: automatic-speech-recognition
|
37 |
+
dataset:
|
38 |
+
name: VIVOS
|
39 |
+
type: vivos
|
40 |
+
args: vi
|
41 |
+
metrics:
|
42 |
+
- name: Test WER
|
43 |
+
type: wer
|
44 |
+
value: 4.04
|
45 |
+
---
|
46 |
+
# Wav2Vec2-Base-Vietnamese-270h
|
47 |
+
A Wav2Vec2 model fine-tuned for Vietnamese speech recognition on about 270 hours of labelled data combined from multiple datasets, including [Common Voice](https://huggingface.co/datasets/common_voice), [VIVOS](https://huggingface.co/datasets/vivos), and [VLSP2020](https://vlsp.org.vn/vlsp2020/eval/asr). The model was fine-tuned with the SpeechBrain toolkit and a custom tokenizer. For a better experience, we encourage you to learn more about [SpeechBrain](https://speechbrain.github.io/).
|
48 |
+
When using this model, make sure that your speech input is sampled at 16kHz.
|
49 |
+
Please refer to [huggingface blog](https://huggingface.co/blog/fine-tune-wav2vec2-english) on how to fine-tune this model on a specific language.
|
50 |
+
|
51 |
+
### Benchmark WER result:
|
52 |
+
| | [VIVOS](https://huggingface.co/datasets/vivos) | [COMMON VOICE VI](https://huggingface.co/datasets/common_voice) |
|
53 |
+
|---|---|---|
|
54 |
+
|without LM| 8.41 | 17.82 |
|
55 |
+
|with 4-grams LM| 4.04 | 9.66 |
|
56 |
+
|
57 |
+
The language model was trained on about 32GB of written text from the [OSCAR](https://huggingface.co/datasets/oscar-corpus/OSCAR-2109) dataset.
|
58 |
+
|
59 |
+
### Usage
|
60 |
+
The model can be used directly (without a language model) as follows:
|
61 |
+
```python
|
62 |
+
from speechbrain.pretrained import EncoderASR
|
63 |
+
|
64 |
+
model = EncoderASR.from_hparams(source="dragonSwing/wav2vec2-base-vn-270h", savedir="pretrained_models/asr-wav2vec2-vi")
|
65 |
+
model.transcribe_file('dragonSwing/wav2vec2-base-vn-270h/example.wav')
|
66 |
+
```
|
67 |
+
|
68 |
+
### Inference on GPU
|
69 |
+
To perform inference on the GPU, add `run_opts={"device":"cuda"}` when calling the `from_hparams` method.
|
70 |
+
|
71 |
+
### Evaluation
|
72 |
+
The model can be evaluated as follows on the Vietnamese test data of Common Voice.
|
73 |
+
```python
|
74 |
+
import torch
|
75 |
+
import torchaudio
|
76 |
+
from datasets import load_dataset, load_metric, Audio
|
77 |
+
from transformers import Wav2Vec2FeatureExtractor
|
78 |
+
from speechbrain.pretrained import EncoderASR
|
79 |
+
import re
|
80 |
+
test_dataset = load_dataset("common_voice", "vi", split="test")
|
81 |
+
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16_000))
|
82 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
83 |
+
wer = load_metric("wer")
|
84 |
+
extractor = Wav2Vec2FeatureExtractor.from_pretrained("dragonSwing/wav2vec2-base-vn-270h")
|
85 |
+
model = EncoderASR.from_hparams(source="dragonSwing/wav2vec2-base-vn-270h", savedir="pretrained_models/asr-wav2vec2-vi", run_opts={'device': device})
|
86 |
+
chars_to_ignore_regex = r'[,?.!\-;:"“%\'�]'
|
87 |
+
# Preprocessing the datasets.
|
88 |
+
# We need to read the audio files as arrays
|
89 |
+
def speech_file_to_array_fn(batch):
|
90 |
+
audio = batch["audio"]
|
91 |
+
batch["target_text"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
|
92 |
+
batch['speech'] = audio['array']
|
93 |
+
return batch
|
94 |
+
test_dataset = test_dataset.map(speech_file_to_array_fn)
|
95 |
+
|
96 |
+
def evaluate(batch):
|
97 |
+
# For padding inputs only
|
98 |
+
inputs = extractor(
|
99 |
+
batch['speech'],
|
100 |
+
sampling_rate=16000,
|
101 |
+
return_tensors="pt",
|
102 |
+
padding=True,
|
103 |
+
do_normalize=False
|
104 |
+
).input_values
|
105 |
+
input_lens = torch.ones(inputs.shape[0])
|
106 |
+
pred_str, pred_tokens = model.transcribe_batch(inputs, input_lens)
|
107 |
+
batch["pred_strings"] = pred_str
|
108 |
+
|
109 |
+
return batch
|
110 |
+
result = test_dataset.map(evaluate, batched=True, batch_size=4)
|
111 |
+
print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["target_text"])))
|
112 |
+
```
|
113 |
+
**Test Result**: 17.817680%
|
114 |
+
|
115 |
+
#### Citation
|
116 |
+
```
|
117 |
+
@misc{SB2021,
|
118 |
+
author = {Ravanelli, Mirco and Parcollet, Titouan and Rouhe, Aku and Plantinga, Peter and Rastorgueva, Elena and Lugosch, Loren and Dawalatabad, Nauman and Ju-Chieh, Chou and Heba, Abdel and Grondin, Francois and Aris, William and Liao, Chien-Feng and Cornell, Samuele and Yeh, Sung-Lin and Na, Hwidong and Gao, Yan and Fu, Szu-Wei and Subakan, Cem and De Mori, Renato and Bengio, Yoshua },
|
119 |
+
title = {SpeechBrain},
|
120 |
+
year = {2021},
|
121 |
+
publisher = {GitHub},
|
122 |
+
journal = {GitHub repository},
|
123 |
+
howpublished = {\url{https://github.com/speechbrain/speechbrain}},
|
124 |
+
}
|
125 |
+
```
|
config.json
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"speechbrain_interface": "EncoderASR",
|
3 |
+
"activation_dropout": 0.0,
|
4 |
+
"apply_spec_augment": true,
|
5 |
+
"architectures": [
|
6 |
+
"Wav2Vec2Model"
|
7 |
+
],
|
8 |
+
"attention_dropout": 0.1,
|
9 |
+
"bos_token_id": 1,
|
10 |
+
"codevector_dim": 256,
|
11 |
+
"contrastive_logits_temperature": 0.1,
|
12 |
+
"conv_bias": false,
|
13 |
+
"conv_dim": [
|
14 |
+
512,
|
15 |
+
512,
|
16 |
+
512,
|
17 |
+
512,
|
18 |
+
512,
|
19 |
+
512,
|
20 |
+
512
|
21 |
+
],
|
22 |
+
"conv_kernel": [
|
23 |
+
10,
|
24 |
+
3,
|
25 |
+
3,
|
26 |
+
3,
|
27 |
+
3,
|
28 |
+
2,
|
29 |
+
2
|
30 |
+
],
|
31 |
+
"conv_stride": [
|
32 |
+
5,
|
33 |
+
2,
|
34 |
+
2,
|
35 |
+
2,
|
36 |
+
2,
|
37 |
+
2,
|
38 |
+
2
|
39 |
+
],
|
40 |
+
"ctc_loss_reduction": "mean",
|
41 |
+
"ctc_zero_infinity": false,
|
42 |
+
"diversity_loss_weight": 0.1,
|
43 |
+
"do_stable_layer_norm": false,
|
44 |
+
"eos_token_id": 2,
|
45 |
+
"feat_extract_activation": "gelu",
|
46 |
+
"feat_extract_dropout": 0.0,
|
47 |
+
"feat_extract_norm": "group",
|
48 |
+
"feat_proj_dropout": 0.1,
|
49 |
+
"feat_quantizer_dropout": 0.0,
|
50 |
+
"final_dropout": 0.1,
|
51 |
+
"hidden_act": "gelu",
|
52 |
+
"hidden_dropout": 0.1,
|
53 |
+
"hidden_dropout_prob": 0.1,
|
54 |
+
"hidden_size": 768,
|
55 |
+
"initializer_range": 0.02,
|
56 |
+
"intermediate_size": 3072,
|
57 |
+
"layer_norm_eps": 1e-05,
|
58 |
+
"layerdrop": 0.1,
|
59 |
+
"mask_feature_length": 10,
|
60 |
+
"mask_feature_prob": 0.0,
|
61 |
+
"mask_time_length": 10,
|
62 |
+
"mask_time_prob": 0.05,
|
63 |
+
"model_type": "wav2vec2",
|
64 |
+
"num_attention_heads": 12,
|
65 |
+
"num_codevector_groups": 2,
|
66 |
+
"num_codevectors_per_group": 320,
|
67 |
+
"num_conv_pos_embedding_groups": 16,
|
68 |
+
"num_conv_pos_embeddings": 128,
|
69 |
+
"num_feat_extract_layers": 7,
|
70 |
+
"num_hidden_layers": 12,
|
71 |
+
"num_negatives": 100,
|
72 |
+
"pad_token_id": 109,
|
73 |
+
"proj_codevector_dim": 256,
|
74 |
+
"torch_dtype": "float32",
|
75 |
+
"transformers_version": "4.6.1",
|
76 |
+
"vocab_size": 110
|
77 |
+
}
|
custom.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import Wav2Vec2CTCTokenizer
|
2 |
+
|
3 |
+
class Wav2Vec2WordpieceTokenizer(Wav2Vec2CTCTokenizer):
    """CTC tokenizer that splits every word into vocabulary word pieces.

    Each whitespace-separated word is segmented greedily: starting from the
    end of the word, the longest suffix found in the vocabulary is taken as a
    piece, and the search repeats on the remaining prefix.
    """

    # Words that are always split after their first two characters
    # (e.g. "gia" -> "gi" + "a") instead of going through the suffix search.
    _SPECIAL_CASES = frozenset({"gia", "qui", "quy", "que", "qua"})

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        word_delimiter_token="|",
        do_lower_case=False,
        **kwargs
    ):
        """Initialise the tokenizer.

        All arguments are forwarded unchanged to ``Wav2Vec2CTCTokenizer``.

        Args:
            vocab_file: Path to the vocabulary file.
            bos_token: Beginning-of-sentence token.
            eos_token: End-of-sentence token.
            unk_token: Token emitted for text not covered by the vocabulary.
            pad_token: Padding token.
            word_delimiter_token: Token inserted between words.
            do_lower_case: Forwarded to the base class.
            **kwargs: Extra keyword arguments forwarded to the base class.
        """
        super().__init__(
            vocab_file=vocab_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            do_lower_case=do_lower_case,
            word_delimiter_token=word_delimiter_token,
            **kwargs,
        )

        # NOTE(review): relies on the private base-class helper `_create_trie`;
        # presumably this keeps special tokens intact before `_tokenize` runs.
        # Confirm it exists in the installed transformers version.
        self._create_trie(self.all_special_tokens_extended)

    def _tokenize(self, text, **kwargs):
        """Convert a string into a sequence of word-piece tokens.

        The text is split on whitespace, each word is segmented into pieces,
        and consecutive words are separated by ``self.word_delimiter_token``.

        Args:
            text: Input string.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A list of token strings. Empty or whitespace-only input yields an
            empty list.
        """
        output_tokens = []
        for token_idx, token in enumerate(text.split()):
            if token in self._SPECIAL_CASES:
                sub_tokens = [token[:2], token[2:]]
            else:
                sub_tokens = self._split_into_pieces(token)

            # The delimiter goes between words only, never before the first.
            if token_idx > 0:
                output_tokens.append(self.word_delimiter_token)
            output_tokens.extend(sub_tokens)
        return output_tokens

    def _split_into_pieces(self, token):
        """Segment one word by greedy longest-suffix matching.

        Args:
            token: A single whitespace-free word.

        Returns:
            The word's pieces in left-to-right order. A trailing character
            for which no suffix is in the vocabulary becomes
            ``self.unk_token``.
        """
        pieces = []  # collected right-to-left, reversed before returning
        end = len(token)
        while end > 0:
            # The smallest start gives the longest suffix of token[:end].
            for start in range(end):
                piece = token[start:end]
                if piece in self.encoder:
                    pieces.append(piece)
                    end = start
                    break
            else:
                # Not even the single last character is known: emit <unk>
                # for it and continue with the rest of the word.
                pieces.append(self.unk_token)
                end -= 1
        pieces.reverse()
        return pieces

    def decode_ids(
        self,
        token_ids,
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        group_tokens: bool = True,
        spaces_between_special_tokens: bool = False,
    ) -> str:
        """Decode token ids to text by delegating to ``self.decode``.

        Provided for compatibility with SpeechBrain interfaces; every
        argument is passed through to ``decode`` unchanged.
        """
        return self.decode(
            token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            group_tokens=group_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )
|
example.wav
ADDED
Binary file (49.6 kB). View file
|
|
hyperparams.yaml
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ################################
|
2 |
+
# Model: wav2vec2 + linear heads + CTC
|
3 |
+
# Augmentation: SpecAugment
|
4 |
+
# Authors: Titouan Parcollet 2021
|
5 |
+
# ################################
|
6 |
+
|
7 |
+
sample_rate: 16000
|
8 |
+
wav2vec2_hub: dragonSwing/wav2vec2-base-vn-270h
|
9 |
+
|
10 |
+
# Model parameters
|
11 |
+
activation: !name:torch.nn.LeakyReLU
|
12 |
+
dnn_layers: 2
|
13 |
+
dnn_neurons: 768
|
14 |
+
emb_size: 128
|
15 |
+
dec_neurons: 768
|
16 |
+
dropout_prob: 0.1
|
17 |
+
|
18 |
+
# Outputs
|
19 |
+
output_neurons: 696 # Tokenizer vocabulary size; blank index = 0 (bos/eos indices are set below)
|
20 |
+
output_tones: 12
|
21 |
+
|
22 |
+
# Decoding parameters
|
23 |
+
# Make sure the bos and eos indices match those of the tokenizer vocabulary
|
24 |
+
blank_index: 0
|
25 |
+
bos_index: 1
|
26 |
+
eos_index: 2
|
27 |
+
unk_index: 3
|
28 |
+
|
29 |
+
tokenizer: !apply:custom.Wav2Vec2WordpieceTokenizer.from_pretrained
|
30 |
+
pretrained_model_name_or_path: !ref <wav2vec2_hub>
|
31 |
+
|
32 |
+
dropout: !new:torch.nn.Dropout
|
33 |
+
p: !ref <dropout_prob>
|
34 |
+
|
35 |
+
wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
|
36 |
+
source: !ref <wav2vec2_hub>
|
37 |
+
output_norm: True
|
38 |
+
freeze: True
|
39 |
+
pretrain: False
|
40 |
+
save_path: null
|
41 |
+
|
42 |
+
lm_head: !new:torch.nn.Linear
|
43 |
+
in_features: !ref <dnn_neurons>
|
44 |
+
out_features: !ref <output_neurons>
|
45 |
+
|
46 |
+
tone_head: !new:torch.nn.Linear
|
47 |
+
in_features: !ref <dnn_neurons>
|
48 |
+
out_features: !ref <output_tones>
|
49 |
+
|
50 |
+
log_softmax: !new:speechbrain.nnet.activations.Softmax
|
51 |
+
apply_log: True
|
52 |
+
|
53 |
+
ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
|
54 |
+
blank_index: !ref <blank_index>
|
55 |
+
|
56 |
+
model: !new:torch.nn.ModuleList
|
57 |
+
- [!ref <wav2vec2>, !ref <dropout>, !ref <lm_head>, !ref <tone_head>]
|
58 |
+
|
59 |
+
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
|
60 |
+
wav2vec2: !ref <wav2vec2>
|
61 |
+
dropout: !ref <dropout>
|
62 |
+
lm_head: !ref <lm_head>
|
63 |
+
|
64 |
+
decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
|
65 |
+
blank_id: !ref <blank_index>
|
66 |
+
|
67 |
+
modules:
|
68 |
+
encoder: !ref <encoder>
|
69 |
+
|
70 |
+
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
71 |
+
loadables:
|
72 |
+
model: !ref <model>
|
model.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e315a64b704fff992630eccd824c2780ec79c346b2c64518ee9b7845af03a65c
|
3 |
+
size 379749523
|
preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"do_normalize": true,
|
3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
4 |
+
"feature_size": 1,
|
5 |
+
"padding_side": "right",
|
6 |
+
"padding_value": 0.0,
|
7 |
+
"return_attention_mask": false,
|
8 |
+
"sampling_rate": 16000
|
9 |
+
}
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "tokenizer_class": "Wav2Vec2WordpieceTokenizer"}
|
vocab.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{ "<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "a": 5, "á": 6, "à": 7, "ả": 8, "ã": 9, "ạ": 10, "e": 11, "é": 12, "è": 13, "ẻ": 14, "ẽ": 15, "ẹ": 16, "ê": 17, "ế": 18, "ề": 19, "ể": 20, "ễ": 21, "ệ": 22, "i": 23, "í": 24, "ì": 25, "ỉ": 26, "ĩ": 27, "ị": 28, "o": 29, "ó": 30, "ò": 31, "ỏ": 32, "õ": 33, "ọ": 34, "ơ": 35, "ớ": 36, "ờ": 37, "ở": 38, "ỡ": 39, "ợ": 40, "ô": 41, "ố": 42, "ồ": 43, "ổ": 44, "ỗ": 45, "ộ": 46, "u": 47, "ú": 48, "ù": 49, "ủ": 50, "ũ": 51, "ụ": 52, "ư": 53, "ứ": 54, "ừ": 55, "ử": 56, "ữ": 57, "ự": 58, "y": 59, "ỳ": 60, "ý": 61, "ỷ": 62, "ỹ": 63, "ỵ": 64, "ă": 65, "ắ": 66, "ằ": 67, "ẳ": 68, "ẵ": 69, "ặ": 70, "â": 71, "ấ": 72, "ầ": 73, "ẩ": 74, "ẫ": 75, "ậ": 76, "đ": 77, "q": 78, "w": 79, "r": 80, "t": 81, "p": 82, "s": 83, "d": 84, "f": 85, "g": 86, "h": 87, "j": 88, "k": 89, "l": 90, "z": 91, "x": 92, "c": 93, "v": 94, "b": 95, "n": 96, "m": 97, "th": 98, "ch": 99, "kh": 100, "ph": 101, "nh": 102, "gh": 103, "qu": 104, "ng": 105, "ngh": 106, "tr": 107, "ác": 108, "ạc": 109, "ai": 110, "ái": 111, "ài": 112, "ải": 113, "ãi": 114, "ại": 115, "am": 116, "ám": 117, "àm": 118, "ảm": 119, "ãm": 120, "ạm": 121, "an": 122, "án": 123, "àn": 124, "ản": 125, "ãn": 126, "ạn": 127, "ao": 128, "áo": 129, "ào": 130, "ảo": 131, "ão": 132, "ạo": 133, "au": 134, "áu": 135, "àu": 136, "ảu": 137, "ãu": 138, "ạu": 139, "áp": 140, "ạp": 141, "át": 142, "ạt": 143, "ay": 144, "áy": 145, "ày": 146, "ảy": 147, "ãy": 148, "ạy": 149, "ắc": 150, "ặc": 151, "ăm": 152, "ằm": 153, "ắm": 154, "ẳm": 155, "ẵm": 156, "ặm": 157, "ăn": 158, "ắn": 159, "ằn": 160, "ẳn": 161, "ẵn": 162, "ặn": 163, "ắp": 164, "ặp": 165, "ắt": 166, "ặt": 167, "ấc": 168, "ậc": 169, "âm": 170, "ấm": 171, "ầm": 172, "ẩm": 173, "ẫm": 174, "ậm": 175, "ân": 176, "ấn": 177, "ần": 178, "ẩn": 179, "ẫn": 180, "ận": 181, "ấp": 182, "ập": 183, "ất": 184, "ật": 185, "âu": 186, "ấu": 187, "ầu": 188, "ẩu": 189, "ẫu": 190, "ậu": 191, "ây": 192, "ấy": 193, "ầy": 194, "ẩy": 195, "ẫy": 196, "ậy": 197, "éc": 198, 
"ẹc": 199, "em": 200, "ém": 201, "èm": 202, "ẻm": 203, "ẽm": 204, "ẹm": 205, "en": 206, "én": 207, "èn": 208, "ẻn": 209, "ẽn": 210, "ẹn": 211, "eo": 212, "éo": 213, "èo": 214, "ẻo": 215, "ẽo": 216, "ẹo": 217, "ép": 218, "ẹp": 219, "ét": 220, "ẹt": 221, "êm": 222, "ếm": 223, "ềm": 224, "ễm": 225, "ệm": 226, "ên": 227, "ến": 228, "ền": 229, "ển": 230, "ện": 231, "ếp": 232, "ệp": 233, "ết": 234, "ệt": 235, "êu": 236, "ếu": 237, "ều": 238, "ểu": 239, "ễu": 240, "ệu": 241, "ia": 242, "ía": 243, "ìa": 244, "ỉa": 245, "ĩa": 246, "ịa": 247, "im": 248, "ím": 249, "ìm": 250, "ỉm": 251, "ĩm": 252, "ịm": 253, "in": 254, "ín": 255, "ìn": 256, "ỉn": 257, "ịn": 258, "íp": 259, "ịp": 260, "ít": 261, "ịt": 262, "iu": 263, "íu": 264, "ìu": 265, "ỉu": 266, "ĩu": 267, "ịu": 268, "oa": 269, "óa": 270, "òa": 271, "ỏa": 272, "õa": 273, "ọa": 274, "oà": 275, "óc": 276, "ọc": 277, "oe": 278, "óe": 279, "òe": 280, "ỏe": 281, "ọe": 282, "oẹ": 283, "oi": 284, "ói": 285, "òi": 286, "ỏi": 287, "õi": 288, "ọi": 289, "om": 290, "óm": 291, "òm": 292, "ỏm": 293, "õm": 294, "ọm": 295, "on": 296, "ón": 297, "òn": 298, "ỏn": 299, "õn": 300, "ọn": 301, "óp": 302, "ọp": 303, "ót": 304, "ọt": 305, "ốc": 306, "ộc": 307, "ôi": 308, "ối": 309, "ồi": 310, "ổi": 311, "ỗi": 312, "ội": 313, "ôm": 314, "ốm": 315, "ồm": 316, "ổm": 317, "ỗm": 318, "ộm": 319, "ôn": 320, "ốn": 321, "ồn": 322, "ổn": 323, "ỗn": 324, "ộn": 325, "ốp": 326, "ộp": 327, "ốt": 328, "ột": 329, "ơi": 330, "ới": 331, "ời": 332, "ởi": 333, "ỡi": 334, "ợi": 335, "ơm": 336, "ớm": 337, "ờm": 338, "ởm": 339, "ỡm": 340, "ợm": 341, "ơn": 342, "ớn": 343, "ờn": 344, "ởn": 345, "ỡn": 346, "ợn": 347, "ớp": 348, "ợp": 349, "ớt": 350, "ợt": 351, "ua": 352, "úa": 353, "ùa": 354, "ủa": 355, "ũa": 356, "ụa": 357, "úc": 358, "ục": 359, "uê": 360, "uế": 361, "uề": 362, "uể": 363, "uệ": 364, "ui": 365, "úi": 366, "ùi": 367, "ủi": 368, "ũi": 369, "ụi": 370, "um": 371, "úm": 372, "ùm": 373, "ủm": 374, "ũm": 375, "ụm": 376, "un": 377, "ún": 378, "ùn": 379, "ủn": 
380, "ũn": 381, "ụn": 382, "úp": 383, "ụp": 384, "út": 385, "ụt": 386, "uy": 387, "úy": 388, "ùy": 389, "ủy": 390, "ũy": 391, "ụy": 392, "ưa": 393, "ứa": 394, "ừa": 395, "ửa": 396, "ữa": 397, "ựa": 398, "ức": 399, "ực": 400, "ửi": 401, "ừm": 402, "uơ": 403, "uở": 404, "ứt": 405, "ựt": 406, "ưu": 407, "ứu": 408, "ừu": 409, "ửu": 410, "ữu": 411, "ựu": 412, "sh": 413, "aw": 414, "ee": 415, "ea": 416, "ei": 417, "ew": 418, "eu": 419, "ie": 420, "oo": 421, "ou": 422, "ow": 423, "oy": 424, "ue": 425, "io": 426, "ách": 427, "ạch": 428, "ang": 429, "áng": 430, "àng": 431, "ảng": 432, "ãng": 433, "ạng": 434, "anh": 435, "ánh": 436, "ành": 437, "ảnh": 438, "ãnh": 439, "ạnh": 440, "ăng": 441, "ắng": 442, "ằng": 443, "ẳng": 444, "ẵng": 445, "ặng": 446, "âng": 447, "ấng": 448, "ầng": 449, "ẩng": 450, "ẫng": 451, "ậng": 452, "eng": 453, "éng": 454, "èng": 455, "ẻng": 456, "ếch": 457, "ệch": 458, "ênh": 459, "ếnh": 460, "ềnh": 461, "ểnh": 462, "ễnh": 463, "ệnh": 464, "ích": 465, "ịch": 466, "iếc": 467, "iệc": 468, "iêm": 469, "iếm": 470, "iềm": 471, "iểm": 472, "iễm": 473, "iệm": 474, "iên": 475, "iến": 476, "iền": 477, "iển": 478, "iễn": 479, "iện": 480, "iếp": 481, "iệp": 482, "iết": 483, "iệt": 484, "iêu": 485, "iếu": 486, "iều": 487, "iểu": 488, "iễu": 489, "iệu": 490, "inh": 491, "ính": 492, "ình": 493, "ỉnh": 494, "ĩnh": 495, "ịnh": 496, "oác": 497, "oạc": 498, "oai": 499, "oái": 500, "oài": 501, "oải": 502, "oãi": 503, "oại": 504, "oàm": 505, "oan": 506, "oán": 507, "oàn": 508, "oản": 509, "oãn": 510, "oạn": 511, "oao": 512, "oáo": 513, "oáp": 514, "oạp": 515, "oát": 516, "oạt": 517, "oay": 518, "oáy": 519, "oảy": 520, "oắc": 521, "oặc": 522, "oăm": 523, "oăn": 524, "oẳn": 525, "oắn": 526, "oằn": 527, "oắt": 528, "oặt": 529, "oen": 530, "oẻn": 531, "oeo": 532, "oéo": 533, "oèo": 534, "oẻo": 535, "oét": 536, "oẹt": 537, "ong": 538, "óng": 539, "òng": 540, "ỏng": 541, "õng": 542, "ọng": 543, "oóc": 544, "oọc": 545, "ông": 546, "ống": 547, "ồng": 548, "ổng": 549, "ỗng": 550, 
"ộng": 551, "uân": 552, "uấn": 553, "uần": 554, "uẩn": 555, "uẫn": 556, "uận": 557, "uất": 558, "uật": 559, "uây": 560, "uấy": 561, "uầy": 562, "ung": 563, "úng": 564, "ùng": 565, "ủng": 566, "ũng": 567, "ụng": 568, "uốc": 569, "uộc": 570, "uôi": 571, "uối": 572, "uồi": 573, "uổi": 574, "uỗi": 575, "uội": 576, "uôm": 577, "uốm": 578, "uồm": 579, "uỗm": 580, "uộm": 581, "uôn": 582, "uốn": 583, "uồn": 584, "uỗn": 585, "uộn": 586, "uốt": 587, "uột": 588, "uýt": 589, "uỵt": 590, "uya": 591, "uỷu": 592, "ưng": 593, "ứng": 594, "ừng": 595, "ửng": 596, "ững": 597, "ựng": 598, "ước": 599, "ược": 600, "ươi": 601, "ưới": 602, "ười": 603, "ưởi": 604, "ưỡi": 605, "ượi": 606, "ươm": 607, "ướm": 608, "ườm": 609, "ượm": 610, "ươn": 611, "ướn": 612, "ườn": 613, "ưỡn": 614, "ượn": 615, "ướp": 616, "ượp": 617, "ướt": 618, "ượt": 619, "ươu": 620, "ướu": 621, "ượu": 622, "yêm": 623, "yếm": 624, "yểm": 625, "yên": 626, "yến": 627, "yêu": 628, "yếu": 629, "yểu": 630, "yết": 631, "iêng": 632, "iếng": 633, "iềng": 634, "iểng": 635, "iễng": 636, "iệng": 637, "oách": 638, "oạch": 639, "oang": 640, "oáng": 641, "oàng": 642, "oảng": 643, "oãng": 644, "oạng": 645, "oanh": 646, "oánh": 647, "oành": 648, "oạnh": 649, "oảnh": 650, "oăng": 651, "oắng": 652, "oằng": 653, "oẳng": 654, "oong": 655, "uếch": 656, "uênh": 657, "uông": 658, "uống": 659, "uồng": 660, "uổng": 661, "uỗng": 662, "uộng": 663, "uých": 664, "uỵch": 665, "uyên": 666, "uyến": 667, "uyền": 668, "uyển": 669, "uyễn": 670, "uyện": 671, "uyết": 672, "uyệt": 673, "uynh": 674, "uỳnh": 675, "uýnh": 676, "uỷnh": 677, "ương": 678, "ướng": 679, "ường": 680, "ưởng": 681, "ưỡng": 682, "ượng": 683, "op": 684, "ot": 685, "gi": 686, "ap": 687, "at": 688, "ac": 689, "it": 690, "ip": 691, "ic": 692, "ep": 693, "et": 694, "ec": 695 }
|