import os

# Accept the Coqui TTS terms of service non-interactively (the TTS package
# checks the COQUI_TOS_AGREED environment variable before downloading XTTS)
os.environ["COQUI_TOS_AGREED"] = "1"

import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from TTS.api import TTS

# Supported languages for both Whisper and XTTS
languages = {
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Polish": "pl",
    "Turkish": "tr",
    "Russian": "ru",
    "Dutch": "nl",
    "Czech": "cs",
    "Arabic": "ar",
    "Chinese": "zh-cn",
    "Japanese": "ja",
    "Hungarian": "hu",
    "Korean": "ko",
    "Hindi": "hi"
}

# Model and Device Configuration
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
whisper_model_id = "openai/whisper-small"
tts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"  # XTTS v2: multilingual TTS with voice cloning

# Load Whisper Model (for transcription and translation)
whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    whisper_model_id, 
    torch_dtype=torch_dtype, 
    low_cpu_mem_usage=True, 
    use_safetensors=True
).to(device)
whisper_processor = AutoProcessor.from_pretrained(whisper_model_id)

# Load TTS Model (for text-to-speech); XTTS v2 clones the voice given in speaker_wav
tts = TTS(model_name=tts_model_name, progress_bar=False).to(device)
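
# Optional sanity check: list the language codes this XTTS model accepts
# (assumes the TTS API exposes a `languages` property, as in recent Coqui builds):
#   print(tts.languages)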

# Translation Pipeline
def create_translate_pipeline(target_language):
    # Note: Whisper officially translates only *to English* via task="translate".
    # Forcing a non-English target language with task="transcribe" is a known
    # workaround whose output quality varies by language pair.
    return pipeline(
        "automatic-speech-recognition",
        model=whisper_model,
        tokenizer=whisper_processor.tokenizer,
        feature_extractor=whisper_processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=1,
        torch_dtype=torch_dtype,
        device=device,
        return_timestamps=True,
        # transformers accepts lowercase language names ("english", "french", ...)
        generate_kwargs={"task": "transcribe", "language": target_language.lower()}
    )
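
# Quick manual test outside the Gradio UI ("sample.wav" is a hypothetical path):
#   pipe = create_translate_pipeline("Spanish")
#   print(pipe("sample.wav")["text"])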

# Audio Processing Function
def process_audio(audio_file, translate_language, tts_language):
    try:
        # Build a pipeline that decodes into the requested target language
        translate_pipeline = create_translate_pipeline(translate_language)

        # Transcribe/translate the uploaded audio to text
        result = translate_pipeline(audio_file)["text"]

        # Synthesize speech in the target language, cloning the input speaker's voice
        output_audio_file = "output.wav"
        tts.tts_to_file(
            text=result,
            speaker_wav=audio_file,
            language=languages[tts_language],
            file_path=output_audio_file,
        )

        return result, output_audio_file

    except Exception as e:
        return f"An error occurred: {e}", None

# Gradio Interface
with gr.Blocks() as interface:
    gr.Markdown("# AI VOX LAB POC")
    gr.Markdown("Upload/record audio, translate, and get synthesized speech!")
    
    # App logo (local absolute path; replace with a path valid on your machine)
    gr.Image(value="/Users/mac/Desktop/VOX_AI/logo_transparent_background.png", label="App Logo", show_label=False, width=700, height=250)
    
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
        translate_lang = gr.Dropdown(choices=list(languages.keys()), value="English", label="Translation Language")
        tts_lang = gr.Dropdown(choices=list(languages.keys()), value="English", label="TTS Synthesis Language")
    
    with gr.Row():
        translate_button = gr.Button("Translate and Synthesize")
    
    with gr.Row():
        text_output = gr.Textbox(label="Translated Text")
        audio_output = gr.Audio(label="Generated Audio")
    
    translate_button.click(
        fn=process_audio, 
        inputs=[audio_input, translate_lang, tts_lang], 
        outputs=[text_output, audio_output]
    )

# Launch the App
if __name__ == "__main__":
    interface.launch(share=True)