Spaces:
Running
Running
import os | |
import numpy as np | |
import gradio as gr | |
import assemblyai as aai | |
from translate import Translator | |
import uuid | |
from elevenlabs import VoiceSettings | |
from elevenlabs.client import ElevenLabs | |
from pathlib import Path | |
# API credentials are read from the environment; each is None when unset.
ELEVENLABS_API = os.getenv("ELEVENLABS_API")
ASSEMBLYAI_API = os.getenv("ASSEMBLYAI_API")
def voice_to_voice(audio_file):
    """Transcribe a recording, translate it, and synthesize each translation.

    Args:
        audio_file: Path of the audio file to process (the Gradio audio
            input is configured with ``type="filepath"``). ``None`` when
            the user submitted without providing audio.

    Returns:
        A tuple containing the path of every generated audio file, in the
        order returned by ``translate_text``, followed by every translated
        string in that same order.

    Raises:
        gr.Error: If no audio was provided, the transcription failed, or
            the transcription produced no text.
    """
    if audio_file is None:
        # Fail with a readable message instead of an opaque SDK error.
        raise gr.Error("Please record or upload an audio file first.")

    transcript = transcribe_audio(audio_file)
    if transcript.status == aai.TranscriptStatus.error:
        raise gr.Error(transcript.error)

    text = transcript.text
    if not text or not text.strip():
        # Nothing to translate or speak; stop before calling paid APIs.
        raise gr.Error("No speech was detected in the recording.")

    translations = translate_text(text)
    audio_paths = [Path(text_to_speech(item)) for item in translations]

    # All audio paths first, then all texts: the caller's output list must
    # be laid out in the same order.
    return tuple(audio_paths + translations)
def transcribe_audio(audio_file):
    """Transcribe an audio file with AssemblyAI.

    Args:
        audio_file: The audio to transcribe; passed unchanged to
            ``aai.Transcriber.transcribe``.

    Returns:
        The transcript object returned by ``aai.Transcriber.transcribe``.
        Callers inspect its ``status``, ``error`` and ``text`` attributes.
    """
    # AssemblyAI must be configured with its own key, not the ElevenLabs one.
    aai.settings.api_key = ASSEMBLYAI_API
    return aai.Transcriber().transcribe(audio_file)
def translate_text(text):
    """Translate English text into each supported target language.

    Args:
        text: The English source text.

    Returns:
        A list with one translation per target language, in the order
        ru, tr, sv, de, es, ja, id.
    """
    target_codes = ("ru", "tr", "sv", "de", "es", "ja", "id")
    # A fresh Translator is built per language, as each is bound to one
    # source/target pair.
    return [
        Translator(from_lang="en", to_lang=code).translate(text)
        for code in target_codes
    ]
def text_to_speech(text, voice_id="<your-voice-id>"):
    """Synthesize speech for ``text`` with ElevenLabs and save it as an MP3.

    Args:
        text: The text to speak.
        voice_id: ElevenLabs voice identifier to synthesize with. The
            default is a placeholder and must be replaced with a real
            voice id (or overridden by the caller).

    Returns:
        The path of the written MP3 file: a freshly generated UUID file
        name, relative to the current working directory.
    """
    client = ElevenLabs(api_key=ELEVENLABS_API)
    response = client.text_to_speech.convert(
        voice_id=voice_id,
        optimize_streaming_latency="0",
        output_format="mp3_22050_32",
        text=text,
        model_id="eleven_multilingual_v2",
        voice_settings=VoiceSettings(
            stability=0.5,
            similarity_boost=0.8,
            style=0.5,
            use_speaker_boost=True,
        ),
    )

    # A random file name keeps concurrent requests from overwriting each other.
    save_file_path = f"{uuid.uuid4()}.mp3"
    with open(save_file_path, "wb") as f:
        # The response is consumed chunk by chunk; empty chunks are skipped.
        for chunk in response:
            if chunk:
                f.write(chunk)
    return save_file_path
# Gradio UI: one audio input, and one audio player plus one text area per
# target language.
with gr.Blocks() as demo:
    gr.Markdown("## audio Translator")
    gr.Markdown(
        "\n"
        "The API Key you need:\n"
        # Markdown link syntax is [text](url); the reverse form does not render.
        "[AssemblyAI API key](https://www.assemblyai.com/?utm_source=youtube&utm_medium=referral&utm_campaign=yt_mis_66)<br>\n"
        "[Elevenlabs API key](https://elevenlabs.io/)<br>\n"
        "Note: you need at least 30 minutes of a voice recording of yourself "
        "for the *Professional voice cloning. But there is also a simpler "
        "voice cloning option that only requires 30 seconds of voice "
        "recording. *Professional voice cloning is a paid feature.\n"
    )
    audio_input = gr.Audio(type="filepath", show_download_button=True)
    submit = gr.Button("Submit", variant="primary")
    clear_button = gr.ClearButton(audio_input, "Clear")

    # Labels follow the order in which translate_text produces translations
    # (ru, tr, sv, de, es, ja, id) so each result appears under its language.
    languages = ["Russian", "Turkish", "Swedish", "German", "Spanish", "Japanese", "indonesian"]
    audio_outputs = []
    text_outputs = []
    for lang in languages:
        with gr.Group():
            audio_outputs.append(gr.Audio(label=lang, interactive=False))
            text_outputs.append(gr.Markdown())

    # voice_to_voice returns every audio path first, followed by every
    # translated text, so the outputs must be listed in that same order.
    output_components = audio_outputs + text_outputs

    submit.click(fn=voice_to_voice, inputs=audio_input, outputs=output_components, show_progress=True)

if __name__ == "__main__":
    demo.launch()