speech-to-speech-translation

Runtime error

wilton

updating of functions, new MMS model for spanish TTS support

7d1dd6b 11 months ago

2.39 kB

	import gradio as gr
	import numpy as np
	import torch
	from datasets import load_dataset
	from transformers import pipeline, VitsModel, AutoTokenizer

	# Pick the first CUDA device when torch reports one, otherwise fall back to CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"

	# Whisper Small speech-recognition pipeline, placed on the selected device.
	asr_pipe = pipeline(
	    "automatic-speech-recognition",
	    model="openai/whisper-small",
	    device=device,
	)

	# Facebook MMS Spanish text-to-speech model and its tokenizer, both loaded
	# from the same checkpoint.
	# NOTE: unlike the ASR pipeline, the TTS model is not moved to `device` here.
	_TTS_CHECKPOINT = "facebook/mms-tts-spa"
	model = VitsModel.from_pretrained(_TTS_CHECKPOINT)
	tokenizer = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)


	# Integer sample format used for the generated audio, and the largest value
	# that format can hold (used to scale float waveforms).
	target_dtype = np.int16
	max_range = np.iinfo(target_dtype).max


	def translate(audio):
	    """Run the Whisper pipeline on ``audio`` and return the text it produces.

	    The pipeline is called with the "transcribe" task and the language forced
	    to Spanish ("es"); per the demo description in this file, the intent is to
	    obtain Spanish text from speech in any source language.

	    Args:
	        audio: Input forwarded unchanged to ``asr_pipe`` (the Gradio inputs in
	            this file supply a file path).

	    Returns:
	        The ``"text"`` entry of the pipeline output.
	    """
	    generation_options = {"task": "transcribe", "language": "es"}
	    result = asr_pipe(
	        audio,
	        max_new_tokens=256,  # upper bound on generated tokens
	        generate_kwargs=generation_options,
	    )
	    return result["text"]


	def synthesise(text):
	    """Convert ``text`` to a speech waveform with the MMS TTS model.

	    Args:
	        text: Text to speak; it is tokenized with the module-level tokenizer.

	    Returns:
	        The model's ``waveform`` output with dimension 0 squeezed out, as a
	        CPU tensor.
	    """
	    model_inputs = tokenizer(text, return_tensors="pt")
	    # Inference only: no gradients need to be tracked.
	    with torch.no_grad():
	        model_output = model(**model_inputs)
	    waveform = model_output.waveform
	    return waveform.squeeze(0).cpu()


	def speech_to_speech_translation(audio):
	    """Turn input speech into synthesised Spanish speech.

	    The audio is first converted to text with ``translate`` and that text is
	    then spoken with ``synthesise``. The resulting float waveform is converted
	    to integer samples of ``target_dtype`` for playback.

	    Args:
	        audio: Input forwarded to ``translate`` (the Gradio inputs in this
	            file supply a file path).

	    Returns:
	        A ``(sampling_rate, samples)`` tuple, where ``samples`` is a NumPy
	        array of ``target_dtype``.
	    """
	    translated_text = translate(audio)
	    waveform = synthesise(translated_text).numpy()
	    # Clamp to [-1, 1] before scaling: a sample outside that range would
	    # overflow the integer cast and come out as a loud click instead of
	    # simply saturating.
	    clipped = np.clip(waveform, -1.0, 1.0)
	    samples = (clipped * max_range).astype(target_dtype)
	    # Report the rate the TTS model actually generates at rather than a
	    # hard-coded constant, so the two cannot drift apart.
	    return model.config.sampling_rate, samples


	# Heading and Markdown description shown on both tabs of the demo.
	title = "Cascaded STST"
	description = """
	Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Spanish. Demo uses OpenAI's [Whisper Small](https://huggingface.co/openai/whisper-small) model for speech translation, and Meta's
	[MMS TTS Spanish](https://huggingface.co/facebook/mms-tts-spa) model for text-to-speech:

	![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
	"""

	# Settings that are identical for the microphone and the file-upload tab.
	_shared_interface_kwargs = {
	    "fn": speech_to_speech_translation,
	    "title": title,
	    "description": description,
	}

	demo = gr.Blocks()

	# NOTE(review): `source=` is the Gradio 3.x keyword for gr.Audio; newer
	# Gradio releases appear to expect `sources=[...]` instead - confirm against
	# the Gradio version this app is pinned to.

	# Tab 1: record from the microphone; the handler receives a file path.
	mic_translate = gr.Interface(
	    inputs=gr.Audio(source="microphone", type="filepath"),
	    outputs=gr.Audio(label="Generated Speech :)", type="numpy"),
	    **_shared_interface_kwargs,
	)

	# Tab 2: upload an audio file; ships with one bundled example.
	file_translate = gr.Interface(
	    inputs=gr.Audio(source="upload", type="filepath"),
	    outputs=gr.Audio(label="Generated Speech", type="numpy"),
	    examples=[["./example.wav"]],
	    **_shared_interface_kwargs,
	)

	# Assemble both interfaces as tabs inside the Blocks container.
	_tab_interfaces = [mic_translate, file_translate]
	_tab_names = ["Microphone", "Audio File"]
	with demo:
	    gr.TabbedInterface(_tab_interfaces, _tab_names)

	demo.launch(debug=True)