Spaces:

datnth1709
/

Realtime-Translation

Runtime error

App Files Files Community

Realtime-Translation / app.py

datnth1709

update inference

2f12a3f about 2 years ago

raw

history blame

5.09 kB

	import gradio as gr
	import nltk
	import librosa

	from transformers import pipeline, TranslationPipeline, AutoTokenizer, TranslationPipeline
	from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2Tokenizer
	from transformers.file_utils import cached_path, hf_bucket_url
	import os, zipfile
	from datasets import load_dataset
	import torch
	import kenlm
	import torchaudio
	from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel

	"""Vietnamese speech2text"""
	cache_dir = './cache/'
	# The processor and the CTC acoustic model are loaded from the same checkpoint id.
	_asr_checkpoint = "nguyenvulebinh/wav2vec2-base-vietnamese-250h"
	processor = Wav2Vec2Processor.from_pretrained(_asr_checkpoint, cache_dir=cache_dir)
	vi_model = Wav2Vec2ForCTC.from_pretrained(_asr_checkpoint, cache_dir=cache_dir)

	# Resolve the zipped 4-gram language model published under the same checkpoint id,
	# pass it through cached_path with the local cache directory, and unpack it there.
	# NOTE(review): hf_bucket_url / cached_path appear to have been removed from recent
	# transformers releases -- confirm the pinned version still provides them.
	_lm_archive_url = hf_bucket_url(_asr_checkpoint, filename='vi_lm_4grams.bin.zip')
	_lm_archive_path = cached_path(_lm_archive_url, cache_dir=cache_dir)
	with zipfile.ZipFile(_lm_archive_path, 'r') as lm_archive:
	    lm_archive.extractall(cache_dir)
	# Path of the extracted binary n-gram model; used below to build the decoder.
	lm_file = os.path.join(cache_dir, 'vi_lm_4grams.bin')

	def get_decoder_ngram_model(tokenizer, ngram_lm_path):
	    """Build a beam-search CTC decoder scored by a KenLM n-gram model.

	    Args:
	        tokenizer: object providing ``get_vocab()`` and the ``pad_token_id``,
	            ``unk_token_id`` and ``word_delimiter_token_id`` attributes.
	        ngram_lm_path: path passed to ``kenlm.Model``.

	    Returns:
	        A ``BeamSearchDecoderCTC`` built from the tokenizer's labels and the
	        loaded language model.
	    """
	    token_to_id = tokenizer.get_vocab()
	    # Order the tokens by id so that list position matches the logit column.
	    ordered = sorted(token_to_id.items(), key=lambda item: (item[1], item[0]))
	    # The last two entries are dropped.
	    # NOTE(review): presumably the bos/eos tokens -- confirm against the vocab.
	    labels = [token for token, _ in ordered][:-2]

	    # Rewrite the special tokens into the form the decoder alphabet expects.
	    replacements = (
	        (tokenizer.pad_token_id, ""),              # pad token doubles as the CTC blank
	        (tokenizer.unk_token_id, ""),              # unknown token is emitted as nothing
	        (tokenizer.word_delimiter_token_id, " "),  # word delimiter becomes a real space
	    )
	    for token_id, text in replacements:
	        labels[token_id] = text

	    # Tell the alphabet explicitly which index is the CTC blank.
	    alphabet = Alphabet.build_alphabet(labels, ctc_token_idx=tokenizer.pad_token_id)
	    return BeamSearchDecoderCTC(
	        alphabet,
	        language_model=LanguageModel(kenlm.Model(ngram_lm_path)),
	    )
	# Build the LM-backed beam-search decoder once at import time; the transcription
	# functions below reuse this single instance.
	ngram_lm_model = get_decoder_ngram_model(processor.tokenizer, lm_file)

	# define function to read in sound file
	def speech_file_to_array_fn(path, max_seconds=10):
	    """Load an audio file as a 16 kHz array, optionally truncated.

	    Args:
	        path: location of the audio file, passed to ``torchaudio.load``.
	        max_seconds: keep at most this many seconds of audio. May be an int or
	            a float; ``None`` or a non-positive value disables truncation.

	    Returns:
	        A dict with keys ``"file"`` (the input path), ``"speech"`` (NumPy array
	        of samples) and ``"sampling_rate"`` (always 16000).
	    """
	    target_rate = 16000
	    speech_array, sampling_rate = torchaudio.load(path)
	    if sampling_rate != target_rate:
	        resample = torchaudio.transforms.Resample(orig_freq=sampling_rate,
	                                                  new_freq=target_rate)
	        speech_array = resample(speech_array)
	    # Keep only the first row of the loaded tensor (the first channel, assuming
	    # torchaudio's default channels-first layout).
	    speech_array = speech_array[0]
	    if max_seconds is not None and max_seconds > 0:
	        # Slice bounds must be integers; a float duration would otherwise raise.
	        speech_array = speech_array[:int(max_seconds * target_rate)]
	    return {
	        "file": path,
	        "speech": speech_array.numpy(),
	        "sampling_rate": target_rate,
	    }

	# Vietnamese speech-to-text (acoustic model + n-gram beam search)
	def speech2text_vi(audio):
	    """Transcribe a Vietnamese audio file.

	    Args:
	        audio: object whose ``name`` attribute is the path of the audio file.

	    Returns:
	        The output of the LM-backed beam-search decoder for the file.
	    """
	    # Read the file into a 16 kHz sample array.
	    ds = speech_file_to_array_fn(audio.name)
	    input_values = processor(
	        ds["speech"],
	        sampling_rate=ds["sampling_rate"],
	        return_tensors="pt"
	    ).input_values
	    # Inference only: no autograd graph is needed for the forward pass.
	    with torch.no_grad():
	        logits = vi_model(input_values).logits[0]
	    # Only the beam-search result is returned; the greedy argmax decode that
	    # used to be computed here was never used and has been dropped.
	    return ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)


	"""Machine translation"""
	# Vietnamese -> English translation pipeline, created once at import time and
	# reused by translate_vi2en.
	vien_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-vi-en_PhoMT"
	vien_translator = pipeline(
	    task="translation",
	    model=vien_model_checkpoint,
	)

	def translate_vi2en(Vietnamese):
	    """Translate Vietnamese text to English.

	    Args:
	        Vietnamese: text to translate.

	    Returns:
	        The ``translation_text`` of the first result produced by the
	        translation pipeline, or ``""`` when the input is an empty or
	        whitespace-only string.
	    """
	    # Blank input (e.g. the transcript of a silent audio chunk) has nothing to
	    # translate, so skip the model call instead of feeding it an empty string.
	    if isinstance(Vietnamese, str) and not Vietnamese.strip():
	        return ""
	    return vien_translator(Vietnamese)[0]['translation_text']


	""" Inference"""
	def inference_vien(audio):
	    """Transcribe Vietnamese audio and translate the transcript to English.

	    Args:
	        audio: audio object accepted by ``speech2text_vi``.

	    Returns:
	        A ``(vi_text, en_text)`` tuple: the transcript and its translation.
	    """
	    transcript = speech2text_vi(audio)
	    return transcript, translate_vi2en(transcript)

	def transcribe_vi_1(audio, state_en=""):
	    """Translate one audio chunk and append it to the running English text.

	    Args:
	        audio: object whose ``name`` attribute is the path of the audio chunk,
	            or ``None`` when no audio is available.
	        state_en: English text accumulated by previous calls; ``None`` is
	            treated as an empty string.

	    Returns:
	        A ``(text, state)`` tuple holding the updated accumulated English text
	        twice: once for display and once as the state for the next call.
	    """
	    if state_en is None:
	        state_en = ""
	    # Nothing to transcribe: leave the accumulated text untouched instead of
	    # failing on ``audio.name``.
	    if audio is None:
	        return state_en, state_en
	    # Reuse the shared transcription helper rather than duplicating its body.
	    en_text = translate_vi2en(speech2text_vi(audio))
	    state_en += en_text + " "
	    return state_en, state_en

	"""Gradio demo"""
	# Sample Vietnamese sentences (not referenced by the interface defined below).
	vi_example_text = [
	    "Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
	    "Ánh mắt ta chạm nhau. Chỉ muốn ngắm anh lâu thật lâu.",
	    "Nếu như một câu nói có thể khiến em vui.",
	]
	# One single-column example row per bundled recording.
	vi_example_voice = [
	    [wav_name]
	    for wav_name in ('vi_speech_01.wav', 'vi_speech_02.wav', 'vi_speech_03.wav')
	]

	with gr.TabItem("Vi-En Realtime Translation"):
	    # Streaming microphone input; transcribe_vi_1 reads the recording via its
	    # ``name`` attribute, hence type="file".
	    mic_input = gr.Audio(
	        source="microphone",
	        label="Input Vietnamese Audio",
	        type="file",
	        streaming=True,
	    )
	    # The "state" input/output pair carries the accumulated English text from
	    # one call of transcribe_vi_1 to the next.
	    realtime_demo = gr.Interface(
	        fn=transcribe_vi_1,
	        inputs=[mic_input, "state"],
	        outputs=["text", "state"],
	        examples=vi_example_voice,
	        live=True,
	    )
	    realtime_demo.launch()