speech-to-speech-translation

Sleeping

App Files Files Community

speech-to-speech-translation / app.py

Marco-Cheung

Update app.py

bcbcb8f 11 months ago

raw history blame

No virus

2.48 kB

	import gradio as gr
	import numpy as np
	import torch
	from transformers import pipeline, VitsModel, VitsTokenizer

	# Use the first CUDA device when torch can see one; otherwise stay on the CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"

	# Generated audio is converted to 16-bit integers further down, so keep the
	# dtype and its largest representable value for scaling float waveforms.
	target_dtype = np.int16
	max_range = np.iinfo(np.int16).max

	# Speech recognition/translation: a Whisper checkpoint wrapped in an ASR
	# pipeline that is placed on `device`.
	ASR_MODEL_NAME = "openai/whisper-base"
	asr_pipe = pipeline(
	    task="automatic-speech-recognition",
	    model=ASR_MODEL_NAME,
	    device=device,
	)

	# Text-to-speech: the VITS weights and their tokenizer are loaded from the
	# same checkpoint id.
	TTS_MODEL_NAME = "Matthijs/mms-tts-deu"
	model = VitsModel.from_pretrained(TTS_MODEL_NAME)
	tokenizer = VitsTokenizer.from_pretrained(TTS_MODEL_NAME)


	def translate(audio):
	    """Run the ASR pipeline on ``audio`` and return the recognised text.

	    The pipeline is invoked with task "transcribe" and language "de", and
	    generation is capped at 256 new tokens.

	    NOTE(review): forcing German output on non-German speech is presumably
	    what produces the translation this function's name promises -- confirm.
	    """
	    generation_options = {"task": "transcribe", "language": "de"}
	    result = asr_pipe(
	        audio,
	        max_new_tokens=256,
	        generate_kwargs=generation_options,
	    )
	    return result["text"]

	def synthesise(text):
	    """Turn ``text`` into a speech waveform with the loaded TTS model.

	    The text is tokenised to PyTorch tensors, the model is run without
	    gradient tracking, and the first entry of the model's ``audio`` output
	    is returned as a CPU tensor.
	    """
	    token_ids = tokenizer(text, return_tensors="pt")["input_ids"]

	    # Inference only: no autograd graph is needed.
	    with torch.no_grad():
	        generated = model(token_ids)

	    return generated.audio[0].cpu()

	def speech_to_speech_translation(audio):
	    """Translate input speech and return it as synthesised 16-bit audio.

	    Args:
	        audio: Input forwarded unchanged to ``translate`` (the Gradio
	            interfaces in this file are configured with ``type="filepath"``).

	    Returns:
	        A ``(sampling_rate, samples)`` tuple, where ``samples`` is a NumPy
	        array of ``target_dtype`` (int16) values, for the ``type="numpy"``
	        audio output component.
	    """
	    translated_text = translate(audio)
	    waveform = synthesise(translated_text).numpy()

	    # Clamp to [-1, 1] before scaling: a sample outside that range is not
	    # representable after multiplying by the int16 maximum, and the cast
	    # would corrupt it instead of saturating.
	    pcm = (np.clip(waveform, -1.0, 1.0) * max_range).astype(target_dtype)

	    # Report the rate stored in the loaded TTS checkpoint's config rather
	    # than a hard-coded value, so swapping the checkpoint keeps playback
	    # speed correct.
	    return model.config.sampling_rate, pcm


	# Heading and Markdown blurb passed to both Gradio interfaces below.
	title = "Cascaded STST - Any language to German speech"
	# MMS (Massively Multilingual Speech) is a Meta AI model family, so the TTS
	# model is credited to Meta.
	description = """
	Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in German. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Meta's
	[MMS TTS](https://huggingface.co/Matthijs/mms-tts-deu) model for text-to-speech:
	![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
	"""
	# Container that hosts the tabbed layout assembled at the bottom.
	demo = gr.Blocks()

	# NOTE(review): `source=` on gr.Audio and `concurrency_count=` on queue()
	# look like Gradio 3.x parameter names -- confirm against the pinned version.

	# Tab 1: record from the microphone and play back the generated speech.
	mic_translate = gr.Interface(
	    title=title,
	    description=description,
	    fn=speech_to_speech_translation,
	    inputs=gr.Audio(type="filepath", source="microphone"),
	    outputs=gr.Audio(type="numpy", label="Generated Speech"),
	)

	# Tab 2: same pipeline, fed from an uploaded file, with one bundled example.
	file_translate = gr.Interface(
	    title=title,
	    description=description,
	    fn=speech_to_speech_translation,
	    inputs=gr.Audio(type="filepath", source="upload"),
	    outputs=gr.Audio(type="numpy", label="Generated Speech"),
	    examples=[["./example.wav"]],
	)

	tab_interfaces = [mic_translate, file_translate]
	tab_names = ["Microphone", "Audio File"]
	with demo:
	    gr.TabbedInterface(tab_interfaces, tab_names)

	# Queue requests (two handled concurrently, at most ten waiting), then serve.
	demo.queue(max_size=10, concurrency_count=2)
	demo.launch()