speech-to-speech-translation

Runtime error

App Files Files Community

speech-to-speech-translation / app.py

preetam8

Update app.py

113d3f1 verified about 1 year ago

raw

history blame contribute delete

3.11 kB

	import gradio as gr
	import logging
	import numpy as np
	import torch

	from transformers import VitsModel, VitsTokenizer, pipeline
	from transformers import M2M100ForConditionalGeneration
	from tokenization_small100 import SMALL100Tokenizer


	# Run the ASR pipeline on the first CUDA device when one is available,
	# otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"

	# Language code handed to the SMaLL-100 tokenizer as ``tgt_lang``.
	target_language = "fr"

	# Hub identifiers of the three checkpoints that make up the cascade.
	_ASR_CHECKPOINT = "bofenghuang/whisper-small-cv11-french"
	_TRANSLATION_CHECKPOINT = "alirezamsh/small100"
	_TTS_CHECKPOINT = "facebook/mms-tts-fra"

	# Speech recognition / speech translation stage (the only stage placed on ``device``).
	asr_pipe = pipeline(
	    "automatic-speech-recognition",
	    model=_ASR_CHECKPOINT,
	    device=device,
	)

	# Text-to-text translation stage.
	translation_model = M2M100ForConditionalGeneration.from_pretrained(_TRANSLATION_CHECKPOINT)
	translation_tokenizer = SMALL100Tokenizer.from_pretrained(
	    _TRANSLATION_CHECKPOINT,
	    tgt_lang=target_language,
	)

	# Text-to-speech stage.
	model = VitsModel.from_pretrained(_TTS_CHECKPOINT)
	tokenizer = VitsTokenizer.from_pretrained(_TTS_CHECKPOINT)


	def translate(audio):
	    """Turn input speech into translated text.

	    The audio is first run through ``asr_pipe`` with the ``translate`` task,
	    and the resulting text is then fed to ``translation_model`` using
	    ``translation_tokenizer`` (whose target language is configured where the
	    tokenizer is loaded).

	    Args:
	        audio: Audio input accepted by ``asr_pipe``; the Gradio inputs in
	            this file pass a file path.

	    Returns:
	        The output of ``translation_tokenizer.batch_decode`` — a list of
	        decoded strings, one per generated sequence — not a bare string.
	    """
	    # Generation options are passed through ``generate_kwargs``; newer
	    # transformers releases deprecate giving ``max_new_tokens`` directly to
	    # the pipeline call.
	    outputs = asr_pipe(
	        audio,
	        generate_kwargs={"task": "translate", "max_new_tokens": 256},
	    )
	    eng_text = outputs["text"]

	    encoded_eng_text = translation_tokenizer(eng_text, return_tensors="pt")
	    generated_tokens = translation_model.generate(**encoded_eng_text)
	    translated_text = translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

	    logging.info("Translated Text: %s", translated_text)
	    return translated_text


	def synthesise(text):
	    """Generate speech for ``text`` with the module-level VITS model.

	    Args:
	        text: Text (or batch of texts) to tokenize with ``tokenizer``.

	    Returns:
	        The first entry of the model's ``waveform`` output, moved to the CPU.
	    """
	    token_ids = tokenizer(text, return_tensors="pt")["input_ids"]

	    # Inference only: no gradients are needed for the forward pass.
	    with torch.no_grad():
	        waveform = model(token_ids)["waveform"][0]

	    logging.info(waveform)
	    return waveform.cpu()


	def speech_to_speech_translation(audio):
	    """Translate input speech and synthesise the translation as audio.

	    Chains ``translate`` and ``synthesise`` and converts the resulting
	    waveform into the ``(sampling_rate, samples)`` pair returned to the
	    ``gr.Audio(type="numpy")`` outputs.

	    Args:
	        audio: Audio input forwarded unchanged to ``translate``.

	    Returns:
	        A tuple ``(sampling_rate, samples)`` where ``samples`` is a NumPy
	        ``int16`` array.
	    """
	    translated_text = translate(audio)
	    waveform = synthesise(translated_text).numpy()

	    # Clip before scaling so samples outside [-1, 1] saturate instead of
	    # wrapping around when cast to int16.
	    samples = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

	    # Report the rate the TTS checkpoint is configured with rather than a
	    # hard-coded value, so the pair stays correct if the checkpoint changes.
	    return model.config.sampling_rate, samples


	# Heading and Markdown blurb shown on both Gradio interfaces.
	title = "Cascaded STST"
	# The ASR link names the checkpoint this script actually loads.
	description = """
	Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in French. Demo uses the [whisper-small-cv11-french](https://huggingface.co/bofenghuang/whisper-small-cv11-french) Whisper model for ASR, the
	[SMaLL-100](https://huggingface.co/alirezamsh/small100) model for text to text translation and Facebook's [MMS TTS-FRA](https://huggingface.co/facebook/mms-tts-fra) for text-to-speech for french:

	![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
	"""

	def _build_interface(source, **extra):
	    """Create one translation interface whose audio input comes from ``source``.

	    ``extra`` is forwarded to ``gr.Interface`` for per-tab options such as
	    ``examples``.
	    """
	    return gr.Interface(
	        fn=speech_to_speech_translation,
	        inputs=gr.Audio(sources=[source], type="filepath"),
	        outputs=gr.Audio(label="Generated Speech", type="numpy"),
	        title=title,
	        description=description,
	        **extra,
	    )


	demo = gr.Blocks()

	# One tab records from the microphone, the other accepts an uploaded file.
	mic_translate = _build_interface("microphone")
	file_translate = _build_interface("upload", examples=[["./example.wav"]])

	with demo:
	    gr.TabbedInterface(
	        [mic_translate, file_translate],
	        ["Microphone", "Audio File"],
	    )

	# Make the INFO-level log calls in the pipeline functions visible before serving.
	logging.getLogger().setLevel(logging.INFO)
	demo.launch()