HuyenNguyen committed on
Commit
b84e0e6
1 Parent(s): ba51655

Upload 10 files

Browse files
README.md ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-4.0
3
+ language:
4
+ - vi
5
+ ---
6
+
7
+ ### Vietnamese ASR sequence-to-sequence model. This model supports outputting normalized text, labeling timestamps, and segmenting multiple speakers.
8
+
9
+
10
+ ```python
11
+ # !pip install transformers sentencepiece torch torchaudio
12
+
13
+ from transformers import SpeechEncoderDecoderModel
14
+ from transformers import AutoFeatureExtractor, AutoTokenizer, GenerationConfig
15
+ import torchaudio
16
+ import torch
17
+
18
model_path = 'nguyenvulebinh/wav2vec2-bartpho'

# The tokenizer and feature extractor are loaded from the same checkpoint
# as the model; the helper functions below read them as module globals.
tokenizer = AutoTokenizer.from_pretrained(model_path)
feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)

# Load the encoder-decoder model in eval mode and move it to the GPU when
# one is available.
model = SpeechEncoderDecoderModel.from_pretrained(model_path)
model = model.eval()
if torch.cuda.is_available():
    model = model.cuda()
24
+
25
+
26
def decode_tokens(token_ids, skip_special_tokens=True, time_precision=0.02):
    """Render generated token ids as text with inline ``|time|`` markers.

    Ids below ``tokenizer.vocab_size`` are collected into runs and decoded
    with the module-level ``tokenizer``. Every id at or above that value is
    treated as a timestamp token and rendered as `` |t| ``, where ``t`` is
    its offset from ``tokenizer.vocab_size`` multiplied by
    ``time_precision`` and formatted with two decimals.

    Args:
        token_ids: Iterable of token ids for a single sequence.
        skip_special_tokens: Forwarded to ``tokenizer.decode``.
        time_precision: Multiplier applied to the timestamp offset
            (presumably seconds per timestamp step -- confirm against the
            model's training setup).

    Returns:
        The decoded string, with ``"< |"`` and ``"| >"`` collapsed to
        ``"<|"`` and ``"|>"``.
    """
    first_timestamp_id = tokenizer.vocab_size

    # Alternating structure: lists hold text ids awaiting decoding, strings
    # are already-rendered timestamp markers.
    segments = [[]]
    for token in token_ids:
        if token >= first_timestamp_id:
            elapsed = (token - first_timestamp_id) * time_precision
            segments += [f" |{elapsed:.2f}| ", []]
        else:
            segments[-1].append(token)

    rendered = []
    for segment in segments:
        if isinstance(segment, str):
            rendered.append(segment)
        else:
            rendered.append(
                tokenizer.decode(segment, skip_special_tokens=skip_special_tokens)
            )

    # Glue the angle brackets onto the adjacent timestamp markers.
    text = "".join(rendered)
    text = text.replace("< |", "<|")
    return text.replace("| >", "|>")
40
+
41
def decode_wav(audio_wavs, asr_model, prefix=""):
    """Transcribe a batch of waveforms and return one string per input.

    The waveforms are padded into a batch with the module-level
    ``feature_extractor``, moved to the device that holds ``asr_model``'s
    parameters, and decoded with beam search. Each generated sequence is
    turned into text by ``decode_tokens``.

    Args:
        audio_wavs: Sequence of waveforms; each is passed to the feature
            extractor as ``input_values`` (presumably 1-D 16 kHz audio --
            confirm against the preprocessor config).
        asr_model: Model exposing ``parameters()`` and ``generate()``.
        prefix: Text encoded with the module-level ``tokenizer`` and used
            as the decoder prompt for every item in the batch.

    Returns:
        A list of decoded strings, one per generated sequence.
    """
    target_device = next(asr_model.parameters()).device

    features = [{"input_values": wav} for wav in audio_wavs]
    batch = feature_extractor.pad(
        features,
        padding=True,
        max_length=None,
        pad_to_multiple_of=None,
        return_tensors="pt",
    )

    # The same prefix is repeated per waveform; the last encoded position is
    # dropped (presumably the trailing end-of-sequence token) so generation
    # continues from the prompt.
    prompt = tokenizer.batch_encode_plus([prefix] * len(audio_wavs), return_tensors="pt")
    prompt_ids = prompt['input_ids'][..., :-1].to(target_device)

    generated = asr_model.generate(
        batch['input_values'].to(target_device),
        attention_mask=batch['attention_mask'].to(target_device),
        decoder_input_ids=prompt_ids,
        generation_config=GenerationConfig(decoder_start_token_id=tokenizer.bos_token_id),
        max_length=250,
        num_beams=25,
        no_repeat_ngram_size=4,
        num_return_sequences=1,
        early_stopping=True,
        return_dict_in_generate=True,
        output_scores=True,
    )

    return [decode_tokens(sequence) for sequence in generated.sequences]
68
+
69
+
70
+ # https://huggingface.co/nguyenvulebinh/wav2vec2-bartpho/resolve/main/sample_news.wav
71
# torchaudio.load returns a tuple whose first item is the waveform tensor;
# squeeze() drops size-1 dimensions before the waveform is batched.
# NOTE(review): assumes a mono recording at the model's sampling rate -- confirm.
waveform = torchaudio.load('sample_news.wav')[0].squeeze()
print(decode_wav([waveform], model))
72
+
73
+ # <|0.00| Gia đình cho biết, nhiều lần đã từng gọi điện báo chính quyền và lực lượng an ninh địa phương nhưng đều không có tác dụng |7.00|>
74
+ # <|8.14| Không ai giúp đỡ được mình một chút nào cả, nên là lúc đó là lúc tuyệt vọng nhất, nó tra tấn mình cực kỳ khổ, gây cái tâm lý ức chế rất là nhiều, rất là lớn |19.02|>
75
+
76
+ ```
77
+
78
+ ### Citation
79
+
80
+ This repository uses the idea from the following paper. Please cite the paper if this model is used to help produce published results or is incorporated into other software.
81
+
82
+ ```text
83
+ @INPROCEEDINGS{10446589,
84
+ author={Nguyen, Thai-Binh and Waibel, Alexander},
85
+ booktitle={ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
86
+ title={Synthetic Conversations Improve Multi-Talker ASR},
87
+ year={2024},
88
+ volume={},
89
+ number={},
90
+ pages={10461-10465},
91
+ keywords={Systematics;Error analysis;Knowledge based systems;Oral communication;Signal processing;Data models;Acoustics;multi-talker;asr;synthetic conversation},
92
+ doi={10.1109/ICASSP48485.2024.10446589}
93
+ }
94
+ ```
95
+
96
+ ### Contact
97
+
98
+ nguyenvulebinh@gmail.com
99
+
100
+ [![Follow](https://img.shields.io/twitter/follow/nguyenvulebinh?style=social)](https://twitter.com/intent/follow?screen_name=nguyenvulebinh)
config.json ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "architectures": [
4
+ "SpeechEncoderDecoderModel"
5
+ ],
6
+ "decoder": {
7
+ "_name_or_path": "vinai/bartpho-syllable-base",
8
+ "activation_dropout": 0.0,
9
+ "activation_function": "gelu",
10
+ "add_cross_attention": true,
11
+ "architectures": [
12
+ "MBartModel"
13
+ ],
14
+ "attention_dropout": 0.0,
15
+ "bad_words_ids": null,
16
+ "begin_suppress_tokens": null,
17
+ "bos_token_id": 0,
18
+ "chunk_size_feed_forward": 0,
19
+ "classifier_dropout": 0.0,
20
+ "cross_attention_hidden_size": null,
21
+ "d_model": 768,
22
+ "decoder_attention_heads": 12,
23
+ "decoder_ffn_dim": 3072,
24
+ "decoder_layerdrop": 0.0,
25
+ "decoder_layers": 6,
26
+ "decoder_start_token_id": 2,
27
+ "diversity_penalty": 0.0,
28
+ "do_sample": false,
29
+ "dropout": 0.1,
30
+ "early_stopping": false,
31
+ "encoder_attention_heads": 12,
32
+ "encoder_ffn_dim": 3072,
33
+ "encoder_layerdrop": 0.0,
34
+ "encoder_layers": 6,
35
+ "encoder_no_repeat_ngram_size": 0,
36
+ "eos_token_id": 2,
37
+ "exponential_decay_length_penalty": null,
38
+ "finetuning_task": null,
39
+ "forced_bos_token_id": null,
40
+ "forced_eos_token_id": 2,
41
+ "gradient_checkpointing": false,
42
+ "id2label": {
43
+ "0": "LABEL_0",
44
+ "1": "LABEL_1"
45
+ },
46
+ "init_std": 0.02,
47
+ "is_decoder": true,
48
+ "is_encoder_decoder": true,
49
+ "label2id": {
50
+ "LABEL_0": 0,
51
+ "LABEL_1": 1
52
+ },
53
+ "length_penalty": 1.0,
54
+ "max_length": 20,
55
+ "max_position_embeddings": 1024,
56
+ "min_length": 0,
57
+ "model_type": "mbart",
58
+ "no_repeat_ngram_size": 0,
59
+ "num_beam_groups": 1,
60
+ "num_beams": 1,
61
+ "num_hidden_layers": 6,
62
+ "num_return_sequences": 1,
63
+ "output_attentions": false,
64
+ "output_hidden_states": false,
65
+ "output_scores": false,
66
+ "pad_token_id": 1,
67
+ "prefix": null,
68
+ "problem_type": null,
69
+ "pruned_heads": {},
70
+ "remove_invalid_values": false,
71
+ "repetition_penalty": 1.0,
72
+ "return_dict": true,
73
+ "return_dict_in_generate": false,
74
+ "scale_embedding": false,
75
+ "sep_token_id": null,
76
+ "suppress_tokens": null,
77
+ "task_specific_params": null,
78
+ "temperature": 1.0,
79
+ "tf_legacy_loss": false,
80
+ "tie_encoder_decoder": false,
81
+ "tie_word_embeddings": true,
82
+ "tokenizer_class": "BartphoTokenizer",
83
+ "top_k": 50,
84
+ "top_p": 1.0,
85
+ "torch_dtype": "float32",
86
+ "torchscript": false,
87
+ "transformers_version": "4.30.2",
88
+ "typical_p": 1.0,
89
+ "use_bfloat16": false,
90
+ "use_cache": true,
91
+ "vocab_size": 41031
92
+ },
93
+ "encoder": {
94
+ "_name_or_path": "nguyenvulebinh/wav2vec2-large-vi",
95
+ "activation_dropout": 0.0,
96
+ "adapter_attn_dim": null,
97
+ "adapter_kernel_size": 3,
98
+ "adapter_stride": 2,
99
+ "add_adapter": true,
100
+ "add_cross_attention": false,
101
+ "apply_spec_augment": true,
102
+ "architectures": [
103
+ "Wav2Vec2ForPreTraining"
104
+ ],
105
+ "attention_dropout": 0.1,
106
+ "bad_words_ids": null,
107
+ "begin_suppress_tokens": null,
108
+ "bos_token_id": 1,
109
+ "chunk_size_feed_forward": 0,
110
+ "classifier_proj_size": 256,
111
+ "codevector_dim": 768,
112
+ "contrastive_logits_temperature": 0.1,
113
+ "conv_bias": true,
114
+ "conv_dim": [
115
+ 512,
116
+ 512,
117
+ 512,
118
+ 512,
119
+ 512,
120
+ 512,
121
+ 512
122
+ ],
123
+ "conv_kernel": [
124
+ 10,
125
+ 3,
126
+ 3,
127
+ 3,
128
+ 3,
129
+ 2,
130
+ 2
131
+ ],
132
+ "conv_stride": [
133
+ 5,
134
+ 2,
135
+ 2,
136
+ 2,
137
+ 2,
138
+ 2,
139
+ 2
140
+ ],
141
+ "cross_attention_hidden_size": null,
142
+ "ctc_loss_reduction": "sum",
143
+ "ctc_zero_infinity": false,
144
+ "decoder_start_token_id": null,
145
+ "diversity_loss_weight": 0.1,
146
+ "diversity_penalty": 0.0,
147
+ "do_sample": false,
148
+ "do_stable_layer_norm": true,
149
+ "early_stopping": false,
150
+ "encoder_no_repeat_ngram_size": 0,
151
+ "eos_token_id": 2,
152
+ "exponential_decay_length_penalty": null,
153
+ "feat_extract_activation": "gelu",
154
+ "feat_extract_dropout": 0.0,
155
+ "feat_extract_norm": "layer",
156
+ "feat_proj_dropout": 0.1,
157
+ "feat_quantizer_dropout": 0.0,
158
+ "final_dropout": 0.0,
159
+ "finetuning_task": null,
160
+ "forced_bos_token_id": null,
161
+ "forced_eos_token_id": null,
162
+ "gradient_checkpointing": false,
163
+ "hidden_act": "gelu",
164
+ "hidden_dropout": 0.1,
165
+ "hidden_size": 1024,
166
+ "id2label": {
167
+ "0": "LABEL_0",
168
+ "1": "LABEL_1"
169
+ },
170
+ "initializer_range": 0.02,
171
+ "intermediate_size": 4096,
172
+ "is_decoder": false,
173
+ "is_encoder_decoder": false,
174
+ "label2id": {
175
+ "LABEL_0": 0,
176
+ "LABEL_1": 1
177
+ },
178
+ "layer_norm_eps": 1e-05,
179
+ "layerdrop": 0.1,
180
+ "length_penalty": 1.0,
181
+ "mask_channel_length": 10,
182
+ "mask_channel_min_space": 1,
183
+ "mask_channel_other": 0.0,
184
+ "mask_channel_prob": 0.0,
185
+ "mask_channel_selection": "static",
186
+ "mask_feature_length": 10,
187
+ "mask_feature_min_masks": 0,
188
+ "mask_feature_prob": 0.0,
189
+ "mask_time_length": 10,
190
+ "mask_time_min_masks": 2,
191
+ "mask_time_min_space": 1,
192
+ "mask_time_other": 0.0,
193
+ "mask_time_prob": 0.075,
194
+ "mask_time_selection": "static",
195
+ "max_length": 20,
196
+ "min_length": 0,
197
+ "model_type": "wav2vec2",
198
+ "no_repeat_ngram_size": 0,
199
+ "num_adapter_layers": 3,
200
+ "num_attention_heads": 16,
201
+ "num_beam_groups": 1,
202
+ "num_beams": 1,
203
+ "num_codevector_groups": 2,
204
+ "num_codevectors_per_group": 320,
205
+ "num_conv_pos_embedding_groups": 16,
206
+ "num_conv_pos_embeddings": 128,
207
+ "num_feat_extract_layers": 7,
208
+ "num_hidden_layers": 24,
209
+ "num_negatives": 100,
210
+ "num_return_sequences": 1,
211
+ "output_attentions": false,
212
+ "output_hidden_size": 1024,
213
+ "output_hidden_states": false,
214
+ "output_scores": false,
215
+ "pad_token_id": 0,
216
+ "prefix": null,
217
+ "problem_type": null,
218
+ "proj_codevector_dim": 768,
219
+ "pruned_heads": {},
220
+ "remove_invalid_values": false,
221
+ "repetition_penalty": 1.0,
222
+ "return_dict": true,
223
+ "return_dict_in_generate": false,
224
+ "sep_token_id": null,
225
+ "suppress_tokens": null,
226
+ "task_specific_params": null,
227
+ "tdnn_dilation": [
228
+ 1,
229
+ 2,
230
+ 3,
231
+ 1,
232
+ 1
233
+ ],
234
+ "tdnn_dim": [
235
+ 512,
236
+ 512,
237
+ 512,
238
+ 512,
239
+ 1500
240
+ ],
241
+ "tdnn_kernel": [
242
+ 5,
243
+ 3,
244
+ 3,
245
+ 1,
246
+ 1
247
+ ],
248
+ "temperature": 1.0,
249
+ "tf_legacy_loss": false,
250
+ "tie_encoder_decoder": false,
251
+ "tie_word_embeddings": true,
252
+ "tokenizer_class": null,
253
+ "top_k": 50,
254
+ "top_p": 1.0,
255
+ "torch_dtype": "float32",
256
+ "torchscript": false,
257
+ "transformers_version": "4.30.2",
258
+ "typical_p": 1.0,
259
+ "use_bfloat16": false,
260
+ "use_weighted_layer_sum": false,
261
+ "vocab_size": 96,
262
+ "xvector_output_dim": 512
263
+ },
264
+ "is_encoder_decoder": true,
265
+ "model_type": "speech-encoder-decoder",
266
+ "tie_word_embeddings": false,
267
+ "torch_dtype": "float32",
268
+ "transformers_version": null
269
+ }
dict.txt ADDED
The diff for this file is too large to render. See raw diff
 
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "forced_eos_token_id": 2,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.30.2"
9
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 16000
10
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27dc020061122a4c4d162f08ca41ba37d62525b82b4ab5309898c9a268f63f61
3
+ size 135
sample_news.wav ADDED
Binary file (609 kB). View file
 
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8a54190d2b9256881ed34ab5428786629f929dd5a579350a6ef4735b86a9208
3
+ size 132
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": true,
4
+ "cls_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "mask_token": {
7
+ "__type": "AddedToken",
8
+ "content": "<mask>",
9
+ "lstrip": true,
10
+ "normalized": true,
11
+ "rstrip": false,
12
+ "single_word": false
13
+ },
14
+ "model_max_length": 1000000000000000019884624838656,
15
+ "pad_token": "<pad>",
16
+ "sep_token": "</s>",
17
+ "sp_model_kwargs": {},
18
+ "tokenizer_class": "BartphoTokenizer",
19
+ "unk_token": "<unk>"
20
+ }