Spaces:

juulaii
/

En2Es-Speech-Translator

Runtime error

App Files Files Community

En2Es-Speech-Translator / app.py

juulaii

Update app.py

64a4879 over 2 years ago

raw

history blame

4.35 kB

	import gradio as gr

	#Get models
	#ASR model for input speech
	import torch
	from transformers import Wav2Vec2Processor, HubertForCTC
	from datasets import load_dataset
	import soundfile as sf

	# The feature processor and the CTC model are both loaded from the same
	# fine-tuned HuBERT checkpoint, so the identifier is spelled out once.
	_ASR_CHECKPOINT = "facebook/hubert-large-ls960-ft"
	processor = Wav2Vec2Processor.from_pretrained(_ASR_CHECKPOINT)
	model = HubertForCTC.from_pretrained(_ASR_CHECKPOINT)

	def map_to_array(batch):
	    """Decode the audio file referenced by a dataset row.

	    Args:
	        batch: Mapping with a ``"file"`` entry naming an audio file that
	            ``soundfile`` can read.

	    Returns:
	        The same ``batch`` object, with the decoded samples stored under
	        the ``"speech"`` key.
	    """
	    # sf.read returns (samples, sample_rate); the rate is not used here.
	    speech, _ = sf.read(batch["file"])
	    batch["speech"] = speech
	    return batch

	# One-off transcription performed at import time: decode the first clip of a
	# small LibriSpeech dummy split and keep the resulting text in
	# ``transcription``.
	ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
	ds = ds.map(map_to_array)  # adds a "speech" entry with the decoded samples

	input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
	# Inference only: no gradients are needed, so skip autograd bookkeeping
	# for the forward pass.
	with torch.no_grad():
	    logits = model(input_values).logits
	# Greedy decoding: pick the highest-scoring token id for every frame.
	predicted_ids = torch.argmax(logits, dim=-1)
	transcription = processor.decode(predicted_ids[0])

	def transcribe(audio):
	    """Transcribe a recorded English audio clip with the HuBERT CTC model.

	    Args:
	        audio: Value delivered by the Gradio ``Audio`` input declared with
	            ``type="file"``. This is expected to be a temporary file object
	            exposing ``.name``; a plain path string is accepted as well.
	            ``None`` (nothing recorded) yields an empty transcription.

	    Returns:
	        The decoded transcription as a string.
	    """
	    if audio is None:
	        return ""
	    path = getattr(audio, "name", audio)
	    speech, _ = sf.read(path)
	    if speech.ndim > 1:
	        # Multi-channel recording: average the channels down to mono.
	        speech = speech.mean(axis=1)
	    # NOTE(review): the checkpoint is documented as trained on 16 kHz audio;
	    # microphone recordings at another rate are not resampled here - confirm
	    # whether a resampling step is needed.
	    input_values = processor(speech, return_tensors="pt").input_values
	    with torch.no_grad():  # inference only
	        logits = model(input_values).logits
	    predicted_ids = torch.argmax(logits, dim=-1)
	    return processor.decode(predicted_ids[0])


	# ASR stage: microphone recording -> English text, using the locally loaded
	# model. gr.Interface needs a callable, not a precomputed string.
	speech2text = gr.Interface(
	    fn=transcribe,
	    inputs=gr.inputs.Audio(label="Record Audio File", type="file", source="microphone"),
	    outputs="text",
	)
	# Translation stage: English text -> Spanish text. Its input is supplied by
	# whatever interface precedes it in a chain, so no input value is passed here.
	translator = gr.Interface.load(
	    "huggingface/Helsinki-NLP/opus-mt-en-es",
	    outputs=gr.outputs.Textbox(label="English to Spanish Translated Text"),
	)
	# TTS stage: Spanish text -> Spanish speech.
	text2speech = gr.Interface.load(
	    "huggingface/facebook/tts_transformer-es-css10",
	    outputs=gr.outputs.Audio(label="English to Spanish Translated Audio"),
	    allow_flagging="never",
	)


	# Compose the three stages. gr.Series chains interfaces so that each one's
	# output becomes the next one's input; gr.Parallel combines interfaces that
	# take the same input and presents their outputs together.
	translate = gr.Series(speech2text, translator) # speech -> English text -> Spanish text
	en2es = gr.Series(translate, text2speech) # speech -> ... -> Spanish text -> Spanish audio
	ui = gr.Parallel(translate, en2es) # one recording yields both the Spanish text and the Spanish audio

	#gradio interface
	# Presentation metadata is assigned onto the combined interface after it has
	# been constructed, then the app is launched.
	ui.title = "English to Spanish Speech Translator"
	# Short HTML blurb describing the tool.
	ui.description = """<center>A useful tool in translating English to Spanish audio. All pre-trained models are found in huggingface.</center>"""
	# Example clips offered in the UI.
	# NOTE(review): presumably these .wav files live next to this script - confirm they are present.
	ui.examples = [['ljspeech.wav'],['ljspeech2.wav',]]
	ui.theme = "peach"
	# Longer HTML write-up covering the three pre-trained models used by the
	# pipeline: speech recognition, text translation and text-to-speech.
	ui.article = """<h2>Pre-trained model Information</h2>
	<h3>Automatic Speech Recognition</h3>
	<p style='text-align: justify'>The model used for the ASR part of this space is from
	[https://huggingface.co/facebook/hubert-large-ls960-ft] which is pretrained and fine-tuned on <b>960 hours of
	Librispeech</b> on 16kHz sampled speech audio. This model has a self-reported <b>word error rate (WER)</b> of <b>1.9
	percent</b> and ranks first in <i>paperswithcode</i> for ASR on Librispeech. More information can be
	found on its website at [https://ai.facebook.com/blog/hubert-self-supervised-representation-learning-for-speech-recognition-
	generation-and-compression] and
	original model is under [https://github.com/pytorch/fairseq/tree/main/examples/hubert].</p>
	<h3>Text Translator</h3>
	<p style='text-align: justify'>The English to Spanish text translator pre-trained model is from
	[https://huggingface.co/Helsinki-NLP/opus-mt-en-es] which is part of the <b>The Tatoeba Translation Challenge
	(v2021-08-07)</b> as seen from its github repo at
	[https://github.com/Helsinki-NLP/Tatoeba-Challenge]. This project aims to develop machine
	translation in real-world
	cases for many languages. </p>
	<h3>Text to Speech</h3>
	<p style='text-align: justify'> The TTS model used is from [https://huggingface.co/facebook/tts_transformer-es-css10].
	This model uses the <b>Fairseq(-py)</b> sequence modeling toolkit for speech synthesis, in this case, specifically TTS
	for Spanish. More information can be seen on their git at
	[https://github.com/pytorch/fairseq/tree/main/examples/speech_synthesis]. </p>
	"""


	# Start the Gradio app; inbrowser=True additionally asks Gradio to open it
	# in a browser tab on the machine running the script.
	ui.launch(inbrowser=True)