# -*- coding: utf-8 -*-
"""SeamlessM4T v2 translation demo (Gradio).

Untitled2.ipynb — automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1dwiOTRmj8MDuVOgv2OKzE7qx3UXPKCQH
"""
# Install necessary packages.
# NOTE: '!pip ...' is IPython shell magic and is a SyntaxError in a plain
# Python script, so it is kept commented out here. Run it manually (or
# uncomment in a notebook cell) before executing this file:
# !pip install gradio git+https://github.com/huggingface/transformers.git sentencepiece torchaudio

import gradio as gr
from transformers import AutoProcessor, SeamlessM4Tv2Model
import torchaudio
import numpy as np

# Load the processor and model once at import time (large checkpoint; the
# first call downloads several GB from the Hugging Face Hub).
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
# Sampling rate of the waveforms the model's speech decoder produces.
sample_rate = model.config.sampling_rate
# Text-to-Speech function
def text_to_speech(text, src_lang="eng", tgt_lang="arb"):
    """Translate *text* from ``src_lang`` and synthesize speech in ``tgt_lang``.

    Args:
        text: Input text to translate and speak.
        src_lang: Source language code (SeamlessM4T 3-letter code, e.g. "eng").
        tgt_lang: Target language code for the synthesized speech.

    Returns:
        A ``(sample_rate, waveform)`` tuple, the format ``gr.Audio`` accepts
        as output (waveform is a 1-D numpy array after ``squeeze()``).
    """
    text_inputs = processor(text=text, src_lang=src_lang, return_tensors="pt")
    # generate() returns speech when generate_speech is left at its default;
    # take the first (and only) batch element and move it to CPU numpy.
    waveform = model.generate(**text_inputs, tgt_lang=tgt_lang)[0].cpu().numpy().squeeze()
    return sample_rate, waveform
# Speech-to-Speech function
def speech_to_speech(audio, src_lang="eng", tgt_lang="rus"):
    """Translate spoken audio into speech in ``tgt_lang``.

    Args:
        audio: Path to an audio file (gr.Audio with type="filepath").
        src_lang: Accepted for interface symmetry but not passed to the
            processor — presumably the model infers the source language
            from the audio itself (TODO confirm).
        tgt_lang: Target language code for the synthesized speech.

    Returns:
        A ``(sample_rate, waveform)`` tuple for ``gr.Audio`` output.
    """
    waveform, orig_freq = torchaudio.load(audio)
    # SeamlessM4T requires a 16 kHz input waveform.
    waveform = torchaudio.functional.resample(waveform, orig_freq=orig_freq, new_freq=16000)
    audio_inputs = processor(audios=waveform, return_tensors="pt")
    out_waveform = model.generate(**audio_inputs, tgt_lang=tgt_lang)[0].cpu().numpy().squeeze()
    return sample_rate, out_waveform
# Speech-to-Text function
def speech_to_text(audio, src_lang="eng", tgt_lang="ces"):
    """Transcribe-and-translate spoken audio into ``tgt_lang`` text.

    Args:
        audio: Path to an audio file (gr.Audio with type="filepath").
        src_lang: Accepted for interface symmetry but not passed to the
            processor — presumably the model infers the source language
            from the audio itself (TODO confirm).
        tgt_lang: Target language code for the translated text.

    Returns:
        The translated text as a string.
    """
    waveform, orig_freq = torchaudio.load(audio)
    # SeamlessM4T requires a 16 kHz input waveform.
    waveform = torchaudio.functional.resample(waveform, orig_freq=orig_freq, new_freq=16000)
    audio_inputs = processor(audios=waveform, return_tensors="pt")
    # generate_speech=False makes generate() return text tokens instead of audio.
    output_tokens = model.generate(**audio_inputs, tgt_lang=tgt_lang, generate_speech=False)
    return processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
# Text-to-Text function
def text_to_text(text, src_lang="eng", tgt_lang="ces"):
    """Translate *text* from ``src_lang`` into ``tgt_lang``.

    Args:
        text: Input text to translate.
        src_lang: Source language code (SeamlessM4T 3-letter code).
        tgt_lang: Target language code.

    Returns:
        The translated text as a string.
    """
    text_inputs = processor(text=text, src_lang=src_lang, return_tensors="pt")
    # generate_speech=False makes generate() return text tokens instead of audio.
    output_tokens = model.generate(**text_inputs, tgt_lang=tgt_lang, generate_speech=False)
    return processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
# Create one Gradio interface per task; all share the same language-code
# textbox convention (free-form SeamlessM4T 3-letter codes).
text_to_speech_interface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Textbox(label="Source Language", value="eng"),
        gr.Textbox(label="Target Language", value="arb"),
    ],
    outputs=[gr.Audio(label="Output Audio")],
)

speech_to_speech_interface = gr.Interface(
    fn=speech_to_speech,
    inputs=[
        # type="filepath" hands the handler a path on disk, which
        # torchaudio.load expects.
        gr.Audio(type="filepath"),
        gr.Textbox(label="Source Language", value="eng"),
        gr.Textbox(label="Target Language", value="rus"),
    ],
    outputs=[gr.Audio(label="Output Audio")],
)

speech_to_text_interface = gr.Interface(
    fn=speech_to_text,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(label="Source Language", value="eng"),
        gr.Textbox(label="Target Language", value="ces"),
    ],
    outputs=gr.Textbox(label="Translated Text"),
)

text_to_text_interface = gr.Interface(
    fn=text_to_text,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Textbox(label="Source Language", value="eng"),
        gr.Textbox(label="Target Language", value="ces"),
    ],
    outputs=gr.Textbox(label="Translated Text"),
)

# Combine all interfaces into a single tabbed interface
app = gr.TabbedInterface(
    [text_to_speech_interface, speech_to_speech_interface, speech_to_text_interface, text_to_text_interface],
    ["Text-to-Speech", "Speech-to-Speech", "Speech-to-Text", "Text-to-Text"],
)

# Launch the app (starts a local web server and blocks).
app.launch()