datnth1709 committed
Commit e8eafba
1 Parent(s): 607348e

update speech2text module

Files changed (7)
  1. .gitignore +1 -4
  2. app.py +93 -6
  3. packages.txt +1 -0
  4. requirements.txt +9 -1
  5. vi_speech_01.wav +0 -0
  6. vi_speech_02.wav +0 -0
  7. vi_speech_03.wav +0 -0
.gitignore CHANGED
@@ -4,7 +4,4 @@ __pycache__
 .git
 .vs
 .vscode
 .ipynb_checkpoints
-
-# Except this file
-*.pbf

app.py CHANGED
@@ -1,15 +1,102 @@
 import gradio as gr
 from transformers import pipeline
 
+import gradio as gr
+from transformers.file_utils import cached_path, hf_bucket_url
+import os, zipfile
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+from datasets import load_dataset
+import torch
+import kenlm
+import torchaudio
+from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel
+
+"""Vietnamese speech2text"""
+cache_dir = './cache/'
+processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
+model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
+lm_file = hf_bucket_url("nguyenvulebinh/wav2vec2-base-vietnamese-250h", filename='vi_lm_4grams.bin.zip')
+lm_file = cached_path(lm_file, cache_dir=cache_dir)
+with zipfile.ZipFile(lm_file, 'r') as zip_ref:
+    zip_ref.extractall(cache_dir)
+lm_file = cache_dir + 'vi_lm_4grams.bin'
+
+def get_decoder_ngram_model(tokenizer, ngram_lm_path):
+    vocab_dict = tokenizer.get_vocab()
+    sort_vocab = sorted((value, key) for (key, value) in vocab_dict.items())
+    vocab = [x[1] for x in sort_vocab][:-2]
+    vocab_list = vocab
+    # convert ctc blank character representation
+    vocab_list[tokenizer.pad_token_id] = ""
+    # replace special characters
+    vocab_list[tokenizer.unk_token_id] = ""
+    # vocab_list[tokenizer.bos_token_id] = ""
+    # vocab_list[tokenizer.eos_token_id] = ""
+    # convert space character representation
+    vocab_list[tokenizer.word_delimiter_token_id] = " "
+    # specify ctc blank char index, since conventionally it is the last entry of the logit matrix
+    alphabet = Alphabet.build_alphabet(vocab_list, ctc_token_idx=tokenizer.pad_token_id)
+    lm_model = kenlm.Model(ngram_lm_path)
+    decoder = BeamSearchDecoderCTC(alphabet,
+                                   language_model=LanguageModel(lm_model))
+    return decoder
+ngram_lm_model = get_decoder_ngram_model(processor.tokenizer, lm_file)
+
+# define function to read in sound file
+def speech_file_to_array_fn(path, max_seconds=10):
+    batch = {"file": path}
+    speech_array, sampling_rate = torchaudio.load(batch["file"])
+    if sampling_rate != 16000:
+        transform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
+                                                   new_freq=16000)
+        speech_array = transform(speech_array)
+    speech_array = speech_array[0]
+    if max_seconds > 0:
+        speech_array = speech_array[:max_seconds*16000]
+    batch["speech"] = speech_array.numpy()
+    batch["sampling_rate"] = 16000
+    return batch
+
+# transcribe with wav2vec2 CTC + n-gram beam search
+def speech2text(audio):
+    # read in the uploaded sound file
+    # (Gradio's type="file" audio input passes a tempfile-like object)
+    ds = speech_file_to_array_fn(audio.name)
+    # infer model
+    input_values = processor(
+        ds["speech"],
+        sampling_rate=ds["sampling_rate"],
+        return_tensors="pt"
+    ).input_values
+    # decode ctc output
+    logits = model(input_values).logits[0]
+    pred_ids = torch.argmax(logits, dim=-1)
+    greedy_search_output = processor.decode(pred_ids)
+    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
+    return beam_search_output
+
+
+
+"""Machine translation"""
 model_checkpoint = "huynguyen208/fantastic4-finetuned-vi-to-en-PhoMT-demo"
 translator = pipeline("translation", model=model_checkpoint)
 
 def translate(Vietnamese):
     return translator(Vietnamese)[0]['translation_text']
 
-iface = gr.Interface(fn=translate,
-                     inputs=["text"],
-                     outputs="text",
-                     title = 'Translate Vietnamese to English',
-                     description = 'Mini Translator')
-iface.launch(inline = False)
+def inference(audio):
+    vi_text = speech2text(audio)
+    en_text = translate(vi_text)
+    return en_text
+
+inputs = gr.inputs.Audio(label="Input Audio", type="file")
+outputs = gr.outputs.Textbox(label="Output Text")
+title = "Speech to text and translate Vietnamese to English"
+description = "Gradio demo for wav2vec2-base-vietnamese-250h and Helsinki-NLP/opus-mt-vi-en"
+examples = [['vi_speech_01.wav'], ['vi_speech_02.wav'], ['vi_speech_03.wav']]
+iface = gr.Interface(inference,
+                     inputs,
+                     outputs,
+                     title=title,
+                     description=description,
+                     examples=examples)
+iface.launch()
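
Review note: a minimal way to exercise the new ASR-to-translation chain outside the Gradio UI is sketched below. It is hypothetical, not part of the commit: it assumes speech2text and translate are already defined in the session (for example by running the top of app.py without the blocking iface.launch() call), and it mimics Gradio's type="file" upload with a SimpleNamespace carrying a .name attribute, pointed at one of the sample clips added in this commit.

from types import SimpleNamespace

# Mimic the tempfile-like object Gradio hands to inference()
fake_upload = SimpleNamespace(name="vi_speech_01.wav")

vi_text = speech2text(fake_upload)  # Vietnamese transcript via LM beam search
en_text = translate(vi_text)        # English translation via the PhoMT-finetuned model
print(vi_text)
print(en_text)
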
packages.txt ADDED
@@ -0,0 +1 @@
+libsndfile1
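
packages.txt is the Spaces convention for apt packages; libsndfile1 is the native library that the soundfile Python package links against at runtime. A quick, hypothetical sanity check (not part of the commit) that the link worked:

import soundfile

# pysoundfile exposes the version of the libsndfile it loaded
print(soundfile.__libsndfile_version__)
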
requirements.txt CHANGED
@@ -1,3 +1,11 @@
 torch
+torchaudio
+speechbrain
+pydub
+datasets
+soundfile
+ffmpeg-python
 gradio
+transformers
 transformers[sentencepiece]
+https://github.com/kpu/kenlm/archive/master.zip
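
The last requirements line installs kenlm's Python bindings straight from the GitHub master archive, since no PyPI wheel is published. Once app.py has unzipped the 4-gram file into ./cache/, the language model can be probed on its own; a small sketch (the path follows app.py's cache_dir logic, the sentence is an arbitrary example):

import kenlm

lm = kenlm.Model("./cache/vi_lm_4grams.bin")
print(lm.order)                                  # 4 for a 4-gram model
print(lm.score("xin chào", bos=True, eos=True))  # log10 probability with sentence markers
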
vi_speech_01.wav ADDED
Binary file (120 kB)

vi_speech_02.wav ADDED
Binary file (49.6 kB)

vi_speech_03.wav ADDED
Binary file (76.8 kB)
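
These three clips feed the examples= list in app.py. Whether the resample branch in speech_file_to_array_fn fires depends on their sample rate, which can be inspected with torchaudio (a hypothetical check, assuming a torchaudio version where info() returns an AudioMetaData object):

import torchaudio

for path in ["vi_speech_01.wav", "vi_speech_02.wav", "vi_speech_03.wav"]:
    info = torchaudio.info(path)
    # 16000 Hz means speech_file_to_array_fn skips the Resample transform
    print(path, info.sample_rate, info.num_channels, info.num_frames)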