Spaces:
Runtime error
Runtime error
File size: 3,589 Bytes
5ef621f 1ac7603 5ef621f 1825e0b 5ef621f 1825e0b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import os
os.environ["COQUI_NO_TERMS"] = "1" # Add this line to accept the TOS
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from TTS.api import TTS
# Display name -> language code table that feeds both UI dropdowns:
# the keys populate the "Translation Language" dropdown and the values
# populate the "TTS Synthesis Language" dropdown. Insertion order is the
# order in which the choices are shown.
languages = {
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Polish": "pl",
    "Turkish": "tr",
    "Russian": "ru",
    "Dutch": "nl",
    "Czech": "cs",
    "Arabic": "ar",
    "Chinese": "zh-cn",
    "Japanese": "ja",
    "Hungarian": "hu",
    "Korean": "ko",
    "Hindi": "hi",
}
# Model and device configuration.
# Use the first CUDA device when one is available, otherwise the CPU.
# Half precision is only selected together with CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
whisper_model_id = "openai/whisper-small"
tts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"  # Replace with your actual TTS model

# Load the Whisper model (speech recognition) once at import time and place
# it on the selected device.
whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    whisper_model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)
# The processor bundles the tokenizer and feature extractor used when the
# recognition pipeline is built.
whisper_processor = AutoProcessor.from_pretrained(whisper_model_id)

# Load the TTS model (text-to-speech) and move it to the same device as
# Whisper; without the explicit `.to(device)` it would stay on the CPU even
# when CUDA is available.
tts = TTS(model_name=tts_model_name, progress_bar=False).to(device)
# Translation Pipeline
# Language codes from the shared UI table that Whisper spells differently.
_WHISPER_LANGUAGE_ALIASES = {"zh-cn": "zh"}

# Pipelines that were already built, keyed by normalized language, so repeated
# requests for the same language reuse one pipeline object.
_PIPELINE_CACHE = {}


def _normalize_whisper_language(target_language):
    """Return *target_language* lower-cased and alias-resolved, or None if empty."""
    if not target_language:
        return None
    normalized = str(target_language).strip().lower()
    return _WHISPER_LANGUAGE_ALIASES.get(normalized, normalized)


def create_translate_pipeline(target_language):
    """Return a Whisper speech-recognition pipeline decoding in *target_language*.

    Args:
        target_language: Language name or code (for example ``"English"``,
            ``"english"`` or ``"en"``). The value is lower-cased and the
            ``"zh-cn"`` code is mapped to ``"zh"`` before it is handed to
            Whisper. ``None`` or an empty string is forwarded as ``None``.

    Returns:
        A ``transformers`` ``automatic-speech-recognition`` pipeline wrapping
        the module-level ``whisper_model``. The same pipeline object is
        returned for repeated calls with an equivalent language.

    NOTE(review): the generation task is ``"transcribe"``, not ``"translate"``;
    confirm that forcing the output language this way is the intended
    "translation" behaviour.
    """
    language = _normalize_whisper_language(target_language)

    cached = _PIPELINE_CACHE.get(language)
    if cached is None:
        cached = pipeline(
            "automatic-speech-recognition",
            model=whisper_model,
            tokenizer=whisper_processor.tokenizer,
            feature_extractor=whisper_processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=30,
            batch_size=1,
            torch_dtype=torch_dtype,
            device=device,
            return_timestamps=True,
            generate_kwargs={"task": "transcribe", "language": language},
        )
        _PIPELINE_CACHE[language] = cached
    return cached
# Audio Processing Function
def process_audio(audio_file, translate_language, tts_language):
    """Transcribe an audio file and re-synthesize the text with the TTS model.

    Args:
        audio_file: Path of the uploaded or recorded audio. It is used both as
            the recognition input and as the ``speaker_wav`` reference for
            synthesis.
        translate_language: Language passed to ``create_translate_pipeline``.
        tts_language: Language for synthesis. Either a code from the
            ``languages`` table values or one of its display-name keys.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the recognized
        text and ``audio_path`` is ``"output.wav"``. On any failure ``text``
        is a message starting with ``"An error occurred: "`` and
        ``audio_path`` is ``None``; this function does not raise.
    """
    import logging  # local import keeps this block self-contained

    # Reject incomplete form input up front with a readable message instead
    # of letting the model libraries fail on None.
    if not audio_file:
        return "An error occurred: no audio file was provided.", None
    if not tts_language:
        return "An error occurred: no TTS synthesis language was selected.", None

    try:
        # Build (or fetch) the recognition pipeline for the requested language.
        translate_pipeline = create_translate_pipeline(translate_language)
        result = translate_pipeline(audio_file)["text"]

        # Nothing to synthesize if no text came back.
        if not result or not result.strip():
            return "An error occurred: no speech was recognized in the audio.", None

        # Accept a display name ("English") as well as a code ("en").
        tts_code = languages.get(tts_language, tts_language)

        # NOTE(review): fixed output path; simultaneous requests would write
        # to the same file.
        output_audio_file = "output.wav"
        tts.tts_to_file(
            result,
            speaker_wav=audio_file,
            language=tts_code,
            file_path=output_audio_file,
        )
        return result, output_audio_file
    except Exception as e:
        # UI boundary: report the failure to the user, but keep the traceback
        # in the logs so it can be diagnosed.
        logging.getLogger(__name__).exception("Audio processing failed")
        return f"An error occurred: {e}", None
# Gradio interface.
# Candidate locations for the logo: the original absolute path first, then a
# copy placed next to this script. The first one that exists is used; when
# none exists the logo is omitted and no missing file is passed to Gradio.
_LOGO_CANDIDATES = (
    "/Users/mac/Desktop/VOX_AI/logo_transparent_background.png",
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "logo_transparent_background.png",
    ),
)
_logo_path = next((path for path in _LOGO_CANDIDATES if os.path.isfile(path)), None)

with gr.Blocks() as interface:
    gr.Markdown("# AI VOX LAB POC")
    gr.Markdown("Upload/record audio, translate, and get synthesized speech!")

    # Show the logo only when an image file was actually found.
    if _logo_path is not None:
        gr.Image(value=_logo_path, label="App Logo", show_label=False, width=700, height=250)

    # Inputs: the audio clip plus the two language selections.
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
        translate_lang = gr.Dropdown(choices=list(languages.keys()), label="Translation Language")
        tts_lang = gr.Dropdown(choices=list(languages.values()), label="TTS Synthesis Language")

    with gr.Row():
        translate_button = gr.Button("Translate and Synthesize")

    # Outputs: recognized text and the synthesized audio.
    with gr.Row():
        text_output = gr.Textbox(label="Translated Text")
        audio_output = gr.Audio(label="Generated Audio")

    # Wire the button to the processing function; the input order matches the
    # parameter order of process_audio.
    translate_button.click(
        fn=process_audio,
        inputs=[audio_input, translate_lang, tts_lang],
        outputs=[text_output, audio_output],
    )

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    interface.launch(share=True)
|