Binh Nguyen committed on
Commit
9cdb89a
•
1 Parent(s): d2ac42f
Files changed (5) hide show
  1. app.py +73 -0
  2. requirements.txt +6 -0
  3. t1_0001-00010.wav +0 -0
  4. t1_utt000000042.wav +0 -0
  5. t2_0000006682.wav +0 -0
app.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers.file_utils import cached_path, hf_bucket_url
3
+ import os, zipfile
4
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
5
+ from datasets import load_dataset
6
+ import soundfile as sf
7
+ import torch
8
+ import kenlm
9
+ from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel
10
+
11
# Directory handed to every download / cache call below.
cache_dir = './cache/'

# Processor and CTC acoustic model, both loaded from the same Hub repository.
processor = Wav2Vec2Processor.from_pretrained(
    "nguyenvulebinh/wav2vec2-base-vietnamese-250h",
    cache_dir=cache_dir,
)
model = Wav2Vec2ForCTC.from_pretrained(
    "nguyenvulebinh/wav2vec2-base-vietnamese-250h",
    cache_dir=cache_dir,
)

# Fetch the zipped 4-gram language model, unpack it into the cache directory,
# then point lm_file at the extracted binary.
lm_file = cached_path(
    hf_bucket_url(
        "nguyenvulebinh/wav2vec2-base-vietnamese-250h",
        filename='vi_lm_4grams.bin.zip',
    ),
    cache_dir=cache_dir,
)
with zipfile.ZipFile(lm_file, 'r') as archive:
    archive.extractall(cache_dir)
lm_file = os.path.join(cache_dir, 'vi_lm_4grams.bin')
19
+
20
+
21
def get_decoder_ngram_model(tokenizer, ngram_lm_path):
    """Build a CTC beam-search decoder that rescores with a KenLM model.

    Args:
        tokenizer: Object providing ``get_vocab()`` and the attributes
            ``pad_token_id``, ``unk_token_id`` and ``word_delimiter_token_id``.
        ngram_lm_path: Path handed to ``kenlm.Model``.

    Returns:
        A ``BeamSearchDecoderCTC`` built from the tokenizer's vocabulary and
        the loaded language model.
    """
    # Order tokens by id so that a token's list position equals its id.
    by_id = sorted(
        tokenizer.get_vocab().items(),
        key=lambda entry: (entry[1], entry[0]),
    )
    # The two highest-id tokens are dropped from the decoder alphabet
    # (presumably the bos/eos markers -- TODO confirm against the vocab file).
    labels = [token for token, _ in by_id][:-2]

    # Rewrite special tokens in the form the decoder alphabet uses:
    # the pad token (used as the CTC blank) and the unknown token become
    # empty strings, and the word delimiter becomes a plain space.
    for token_id, label in (
        (tokenizer.pad_token_id, ""),
        (tokenizer.unk_token_id, ""),
        (tokenizer.word_delimiter_token_id, " "),
    ):
        labels[token_id] = label

    # Tell the alphabet explicitly which index is the CTC blank.
    alphabet = Alphabet.build_alphabet(
        labels,
        ctc_token_idx=tokenizer.pad_token_id,
    )
    language_model = LanguageModel(kenlm.Model(ngram_lm_path))
    return BeamSearchDecoderCTC(alphabet, language_model=language_model)
40
+
41
# Shared beam-search decoder, built once at start-up and used by every request.
ngram_lm_model = get_decoder_ngram_model(
    tokenizer=processor.tokenizer,
    ngram_lm_path=lm_file,
)
42
+
43
+ # define function to read in sound file
44
def map_to_array(batch):
    """Read the audio named by ``batch["file"]`` and store it on the batch.

    Args:
        batch: Mapping whose "file" entry is passed to ``sf.read``.

    Returns:
        The same ``batch`` object, with "speech" and "sampling_rate" set from
        the two values returned by ``sf.read``.
    """
    batch["speech"], batch["sampling_rate"] = sf.read(batch["file"])
    return batch
49
+
50
+ # tokenize
51
def inference(audio):
    """Transcribe one audio input with the acoustic model and LM beam search.

    Args:
        audio: Audio source stored as the "file" entry that ``map_to_array``
            hands to ``sf.read``.

    Returns:
        The text produced by the n-gram beam-search decoder.
    """
    # Load samples and sampling rate for the uploaded audio.
    sample = map_to_array({"file": audio})

    # Turn the raw samples into the tensor the acoustic model expects.
    input_values = processor(
        sample["speech"],
        sampling_rate=sample["sampling_rate"],
        return_tensors="pt",
    ).input_values

    # Run the acoustic model. Gradients are not needed for inference, and the
    # single item of the batch is selected so the decoder receives one
    # utterance's frame-by-frame scores.
    with torch.no_grad():
        logits = model(input_values).logits[0]

    # Decode the CTC output with the language-model-backed beam search.
    return ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
66
+
67
# Gradio UI: one audio input, one text output.
inputs = gr.inputs.Audio(label="Input Audio", type="file")
outputs = gr.outputs.Textbox(label="Output Text")
title = "wav2vec2-base-vietnamese-250h"
description = "Gradio demo for a wav2vec2-base-vietnamese-250h. To use it, simply upload your audio, or click one of the examples to load them. Read more at the links below. Currently supports .wav 16_000hz files"
article = "<p style='text-align: center'><a href='https://github.com/vietai/ASR' target='_blank'> Github repo for demonstration </a> | <a href='https://huggingface.co/nguyenvulebinh/wav2vec2-base-vietnamese-250h' target='_blank'>Pretrained model</a></p>"
# Each inner list is one example row and holds one value per input component.
# The interface has a single audio input, so every sample file gets its own row.
examples = [
    ['t1_0001-00010.wav'],
    ['t1_utt000000042.wav'],
    ['t2_0000006682.wav'],
]
gr.Interface(
    inference,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch==1.9.0
2
+ transformers==4.9.2
3
+ soundfile
4
+ datasets==1.11.0
5
+ pyctcdecode==v0.1.0
6
+ https://github.com/kpu/kenlm/archive/master.zip
t1_0001-00010.wav ADDED
Binary file (120 kB). View file
 
t1_utt000000042.wav ADDED
Binary file (76.8 kB). View file
 
t2_0000006682.wav ADDED
Binary file (49.6 kB). View file