Spaces:
Runtime error
Runtime error
Binh Nguyen
committed on
Commit
•
9cdb89a
1
Parent(s):
d2ac42f
add app
Browse files- app.py +73 -0
- requirements.txt +6 -0
- t1_0001-00010.wav +0 -0
- t1_utt000000042.wav +0 -0
- t2_0000006682.wav +0 -0
app.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from transformers.file_utils import cached_path, hf_bucket_url
|
3 |
+
import os, zipfile
|
4 |
+
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
5 |
+
from datasets import load_dataset
|
6 |
+
import soundfile as sf
|
7 |
+
import torch
|
8 |
+
import kenlm
|
9 |
+
from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel
|
10 |
+
|
11 |
+
# Local directory that receives the downloaded checkpoint files and the
# extracted language model.
cache_dir = './cache/'

# Hub repository that provides the processor, the acoustic model and the
# zipped n-gram language model.
_MODEL_ID = "nguyenvulebinh/wav2vec2-base-vietnamese-250h"

processor = Wav2Vec2Processor.from_pretrained(_MODEL_ID, cache_dir=cache_dir)
model = Wav2Vec2ForCTC.from_pretrained(_MODEL_ID, cache_dir=cache_dir)

# Resolve the archive URL, download it into the cache, then unpack it into
# the same directory.
_lm_archive = cached_path(
    hf_bucket_url(_MODEL_ID, filename='vi_lm_4grams.bin.zip'),
    cache_dir=cache_dir,
)
with zipfile.ZipFile(_lm_archive, 'r') as archive:
    archive.extractall(cache_dir)

# Path handed to the decoder builder below.
# NOTE(review): assumes the archive stores 'vi_lm_4grams.bin' at its root -- confirm.
lm_file = os.path.join(cache_dir, 'vi_lm_4grams.bin')
|
19 |
+
|
20 |
+
|
21 |
+
def get_decoder_ngram_model(tokenizer, ngram_lm_path):
    """Build a pyctcdecode beam-search decoder backed by a KenLM model.

    Args:
        tokenizer: Object exposing ``get_vocab()`` plus the ``pad_token_id``,
            ``unk_token_id`` and ``word_delimiter_token_id`` attributes.
        ngram_lm_path: Path passed to ``kenlm.Model``.

    Returns:
        A ``BeamSearchDecoderCTC`` that combines the tokenizer's alphabet
        with the loaded language model.
    """
    # Order the tokens by id (ties broken by token text) and drop the last
    # two entries.
    # NOTE(review): the dropped pair is presumably the bos/eos tokens -- confirm
    # against the tokenizer's vocabulary.
    by_id = sorted(tokenizer.get_vocab().items(), key=lambda item: (item[1], item[0]))
    labels = [token for token, _ in by_id][:-2]

    # Rewrite the special tokens the way the decoder expects them: the CTC
    # blank (pad) and the unknown token become empty strings, and the word
    # delimiter becomes a plain space. The assignment order is kept as-is.
    for token_id, replacement in (
        (tokenizer.pad_token_id, ""),
        (tokenizer.unk_token_id, ""),
        (tokenizer.word_delimiter_token_id, " "),
    ):
        labels[token_id] = replacement

    # Pass the blank index explicitly rather than relying on the convention
    # that it is the last entry of the logit matrix.
    alphabet = Alphabet.build_alphabet(labels, ctc_token_idx=tokenizer.pad_token_id)
    language_model = LanguageModel(kenlm.Model(ngram_lm_path))
    return BeamSearchDecoderCTC(alphabet, language_model=language_model)
|
40 |
+
|
41 |
+
# Build the LM-backed beam-search decoder once at import time; `inference`
# reuses this module-level instance on every call.
ngram_lm_model = get_decoder_ngram_model(processor.tokenizer, lm_file)
|
42 |
+
|
43 |
+
# define function to read in sound file
|
44 |
+
def map_to_array(batch):
    """Load the audio referenced by ``batch["file"]`` into the batch.

    The two values returned by ``sf.read`` are stored under the ``"speech"``
    and ``"sampling_rate"`` keys. The mapping is modified in place and also
    returned.
    """
    batch["speech"], batch["sampling_rate"] = sf.read(batch["file"])
    return batch
|
49 |
+
|
50 |
+
# transcribe an audio file to text
|
51 |
+
def inference(audio):
    """Transcribe one audio file to text using LM-assisted beam search.

    Args:
        audio: A path to an audio file, or a file object. When the object
            has a ``name`` attribute (as the temp-file wrapper produced by
            a Gradio ``type="file"`` input does), that path is read.

    Returns:
        The value returned by ``ngram_lm_model.decode`` for the model's
        logits.
    """
    # Prefer the on-disk path when given a named file object; plain paths
    # and unnamed file-like objects are passed through unchanged.
    source = getattr(audio, "name", audio)
    ds = map_to_array({"file": source})

    # Turn the raw waveform into model inputs.
    input_values = processor(
        ds["speech"],
        sampling_rate=ds["sampling_rate"],
        return_tensors="pt"
    ).input_values

    # Run the acoustic model. The original code used `logits` without ever
    # computing it, so every call raised NameError. Index 0 selects the
    # single item in the batch.
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(input_values).logits[0]

    # Decode the CTC output with the n-gram language model.
    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
    return beam_search_output
|
66 |
+
|
67 |
+
# Gradio UI: one audio upload in, one text box out.
inputs = gr.inputs.Audio(label="Input Audio", type="file")
outputs = gr.outputs.Textbox(label="Output Text")
title = "wav2vec2-base-vietnamese-250h"
description = "Gradio demo for a wav2vec2-base-vietnamese-250h. To use it, simply upload your audio, or click one of the examples to load them. Read more at the links below. Currently supports .wav 16_000hz files"
article = "<p style='text-align: center'><a href='https://github.com/vietai/ASR' target='_blank'> Github repo for demonstration </a> | <a href='https://huggingface.co/nguyenvulebinh/wav2vec2-base-vietnamese-250h' target='_blank'>Pretrained model</a></p>"
# Each inner list is one example row holding one value per input component.
# The interface has a single audio input, so every row contains exactly one
# file. The original put all three files in one row.
examples = [
    ['t1_0001-00010.wav'],
    ['t1_utt000000042.wav'],
    ['t2_0000006682.wav'],
]
gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples).launch()
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch==1.9.0
|
2 |
+
transformers==4.9.2
|
3 |
+
soundfile
|
4 |
+
datasets==1.11.0
|
5 |
+
pyctcdecode==v0.1.0
|
6 |
+
https://github.com/kpu/kenlm/archive/master.zip
|
t1_0001-00010.wav
ADDED
Binary file (120 kB). View file
|
|
t1_utt000000042.wav
ADDED
Binary file (76.8 kB). View file
|
|
t2_0000006682.wav
ADDED
Binary file (49.6 kB). View file
|
|