Spaces:
No application file
No application file
| # -*- coding: utf-8 -*- | |
| """Untitled2.ipynb | |
| Automatically generated by Colab. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1dwiOTRmj8MDuVOgv2OKzE7qx3UXPKCQH | |
| """ | |
# Install necessary packages before running this file. The leading "!" shell
# magic only works inside a notebook cell and is a syntax error in a plain
# Python module, so the command is kept here as a comment:
#   pip install gradio git+https://github.com/huggingface/transformers.git sentencepiece torchaudio
| import gradio as gr | |
| from transformers import AutoProcessor, SeamlessM4Tv2Model | |
| import torchaudio | |
| import numpy as np | |
| # Load the processor and model | |
# Both the processor and the model are loaded from the same checkpoint.
checkpoint = "facebook/seamless-m4t-v2-large"
processor = AutoProcessor.from_pretrained(checkpoint)
model = SeamlessM4Tv2Model.from_pretrained(checkpoint)

# Sampling rate taken from the model config; the speech-producing functions
# return it alongside the generated waveform.
sample_rate = model.config.sampling_rate
| # Text-to-Speech function | |
def text_to_speech(text, src_lang="eng", tgt_lang="arb"):
    """Translate ``text`` and synthesize the translation as speech.

    Args:
        text: Input text to translate.
        src_lang: Language code of ``text``, forwarded to the processor.
        tgt_lang: Language code of the output, forwarded to ``model.generate``.

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``waveform`` is the first
        element of the generation output converted to a squeezed NumPy array.
    """
    encoded = processor(text=text, src_lang=src_lang, return_tensors="pt")
    generated = model.generate(**encoded, tgt_lang=tgt_lang)
    waveform = generated[0].cpu().numpy().squeeze()
    return sample_rate, waveform
| # Speech-to-Speech function | |
def speech_to_speech(audio, src_lang="eng", tgt_lang="rus"):
    """Translate a spoken utterance into speech in another language.

    Args:
        audio: Path of the audio file to translate.
        src_lang: Unused. Accepted so the signature lines up with the three
            inputs of the Gradio interface; the processor call for audio
            input is not given a source language.
        tgt_lang: Language code of the output, forwarded to ``model.generate``.

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``waveform`` is the first
        element of the generation output converted to a squeezed NumPy array.

    Raises:
        gr.Error: If no audio was supplied.
    """
    if not audio:
        # Without this guard a missing recording fails inside torchaudio.load
        # with an error that tells the user nothing useful.
        raise gr.Error("Please record or upload an audio clip first.")

    target_rate = 16_000  # the processor must be given a 16 kHz waveform
    waveform, orig_freq = torchaudio.load(audio)
    waveform = torchaudio.functional.resample(
        waveform, orig_freq=orig_freq, new_freq=target_rate
    )
    # State the rate explicitly so the feature extractor can check it
    # instead of assuming the input already matches what it expects.
    audio_inputs = processor(
        audios=waveform, sampling_rate=target_rate, return_tensors="pt"
    )
    generated = model.generate(**audio_inputs, tgt_lang=tgt_lang)
    return sample_rate, generated[0].cpu().numpy().squeeze()
| # Speech-to-Text function | |
def speech_to_text(audio, src_lang="eng", tgt_lang="ces"):
    """Translate a spoken utterance into text in another language.

    Args:
        audio: Path of the audio file to translate.
        src_lang: Unused. Accepted so the signature lines up with the three
            inputs of the Gradio interface; the processor call for audio
            input is not given a source language.
        tgt_lang: Language code of the output, forwarded to ``model.generate``.

    Returns:
        The decoded translation as a string, with special tokens removed.

    Raises:
        gr.Error: If no audio was supplied.
    """
    if not audio:
        # Without this guard a missing recording fails inside torchaudio.load
        # with an error that tells the user nothing useful.
        raise gr.Error("Please record or upload an audio clip first.")

    target_rate = 16_000  # the processor must be given a 16 kHz waveform
    waveform, orig_freq = torchaudio.load(audio)
    waveform = torchaudio.functional.resample(
        waveform, orig_freq=orig_freq, new_freq=target_rate
    )
    # State the rate explicitly so the feature extractor can check it
    # instead of assuming the input already matches what it expects.
    audio_inputs = processor(
        audios=waveform, sampling_rate=target_rate, return_tensors="pt"
    )
    # generate_speech=False asks the model for text tokens only.
    output_tokens = model.generate(
        **audio_inputs, tgt_lang=tgt_lang, generate_speech=False
    )
    token_ids = output_tokens[0].tolist()[0]
    return processor.decode(token_ids, skip_special_tokens=True)
| # Text-to-Text function | |
def text_to_text(text, src_lang="eng", tgt_lang="ces"):
    """Translate ``text`` from one language into another.

    Args:
        text: Input text to translate.
        src_lang: Language code of ``text``, forwarded to the processor.
        tgt_lang: Language code of the output, forwarded to ``model.generate``.

    Returns:
        The decoded translation as a string, with special tokens removed.
    """
    encoded = processor(text=text, src_lang=src_lang, return_tensors="pt")
    # generate_speech=False asks the model for text tokens only.
    generated = model.generate(**encoded, tgt_lang=tgt_lang, generate_speech=False)
    token_ids = generated[0].tolist()[0]
    return processor.decode(token_ids, skip_special_tokens=True)
| # Create Gradio interfaces | |
# Text in, translated audio out. The textbox defaults mirror the defaults of
# text_to_speech.
text_to_speech_interface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Textbox(label="Source Language", value="eng"),
        gr.Textbox(label="Target Language", value="arb"),
    ],
    outputs=[
        gr.Audio(label="Output Audio"),
    ],
)
# Audio in, translated audio out. The audio component hands the function a
# file path (type="filepath").
speech_to_speech_interface = gr.Interface(
    fn=speech_to_speech,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(label="Source Language", value="eng"),
        gr.Textbox(label="Target Language", value="rus"),
    ],
    outputs=[
        gr.Audio(label="Output Audio"),
    ],
)
# Audio in, translated text out. The audio component hands the function a
# file path (type="filepath").
speech_to_text_interface = gr.Interface(
    fn=speech_to_text,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(label="Source Language", value="eng"),
        gr.Textbox(label="Target Language", value="ces"),
    ],
    outputs=gr.Textbox(label="Translated Text"),
)
# Text in, translated text out. The textbox defaults mirror the defaults of
# text_to_text.
text_to_text_interface = gr.Interface(
    fn=text_to_text,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Textbox(label="Source Language", value="eng"),
        gr.Textbox(label="Target Language", value="ces"),
    ],
    outputs=gr.Textbox(label="Translated Text"),
)
| # Combine all interfaces into a single tabbed interface | |
# Each interface is paired with the tab title at the same position.
tab_interfaces = [
    text_to_speech_interface,
    speech_to_speech_interface,
    speech_to_text_interface,
    text_to_text_interface,
]
tab_titles = [
    "Text-to-Speech",
    "Speech-to-Speech",
    "Speech-to-Text",
    "Text-to-Text",
]
app = gr.TabbedInterface(tab_interfaces, tab_titles)

# Start the Gradio server.
app.launch()