File size: 3,556 Bytes
22e2a1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64d3e15
22e2a1e
 
 
 
 
 
 
64d3e15
22e2a1e
 
 
64d3e15
22e2a1e
 
 
 
 
 
 
64d3e15
 
 
 
 
22e2a1e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import gradio as gr
import whisper
from translate import Translator
import os

# Accept the Coqui TTS terms of service non-interactively, so the XTTS
# model initialization below does not block on a license prompt.
os.environ["COQUI_TOS_AGREED"] = "1"

# Load the Whisper "base" speech-recognition model once at import time;
# all transcription calls share this module-level instance.
model = whisper.load_model("base")


def speech_to_text(audio_file):
    """Transcribe an audio file using the module-level Whisper model.

    :param audio_file: Path to the input audio file.
    :return: The transcribed text from the Whisper result dict.
    """
    transcription = model.transcribe(audio_file)
    text = transcription["text"]
    print(text)  # echo the transcription to the console for debugging
    return text


# Defining the Translate Function
def translate(text, language):
    """Translate *text* into the target *language*.

    :param text: Input text to be translated.
    :param language: Target language code (e.g. "es").
    :return: The translated text.
    """
    # A fresh Translator is built per call so each request honors the
    # requested target language.
    return Translator(to_lang=language).translate(text)

# Initialize TTS model outside the function to avoid reinitialization on each call
# NOTE: mid-file import kept deliberately — TTS must be imported only after
# COQUI_TOS_AGREED has been set earlier in this file.
from TTS.api import TTS

# XTTS v2: multilingual voice-cloning text-to-speech model, CPU inference.
tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)


# Speech to Speech Function
def s2s(audio, language):
    """Run the full speech-to-speech translation pipeline.

    :param audio: Filepath of the source audio; also used as the speaker
        reference for voice cloning.
    :param language: Target language code shared by the translator and TTS.
    :return: [transcribed_text, translated_text, synthesized_audio_path]
    """
    import tempfile

    print(audio)  # log the incoming file path for debugging
    # Transcribe the source audio, then translate the recognized text.
    result_text = speech_to_text(audio)
    translated_text = translate(result_text, language)

    # Use a unique temp file instead of a fixed "output.wav": concurrent
    # Gradio sessions would otherwise overwrite each other's output.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name

    # Generate speech using the input audio as the speaker's voice
    tts_model.tts_to_file(text=translated_text,
                          file_path=output_path,
                          speaker_wav=audio,
                          language=language)

    return [result_text, translated_text, output_path]


# Display names and matching language codes (parallel lists, same order).
language_names = ["Arabic", "Portuguese", "Chinese", "Czech", "Dutch",
                  "English", "French", "German", "Italian", "Polish",
                  "Russian", "Spanish", "Turkish", "Korean",
                  "Hungarian", "Hindi"]
language_options = ["ar", "pt", "zh-cn", "cs", "nl", "en", "fr", "de",
                    "it", "pl", "ru", "es", "tr", "ko", "hu", "hi"]

# Materialize the (label, value) pairs: `zip` returns a one-shot iterator,
# which would be exhausted after Gradio's first traversal of the choices.
language_dropdown = gr.Dropdown(choices=list(zip(language_names, language_options)),
                                value="es",
                                label="Target Language",
                                )

# UI components: two text outputs and the synthesized audio output.
# NOTE(review): translate_button appears unused in this file — gr.Interface
# renders its own submit button; confirm before removing.
translate_button = gr.Button(value="Synthesize and Translate my Voice!")
transcribed_text = gr.Textbox(label="Transcribed Text")
output_text = gr.Textbox(label="Translated Text")
output_speech = gr.Audio(label="Synthesized Audio", type="filepath")

# Gradio interface wiring: s2s(audio_path, language_code) -> three outputs.
demo = gr.Interface(
    fn=s2s,
    inputs=[gr.Audio(sources=["upload", "microphone"],
                     type="filepath",  # pass s2s a file path, not raw samples
                     format='wav',
                     # value="Original Audio",
                     show_download_button=True,
                     waveform_options=gr.WaveformOptions(
                         waveform_color="#01C6FF",
                         waveform_progress_color="#FF69B4",
                         skip_length=2,
                         show_controls=False,
                     )
                     ),
            language_dropdown],
    outputs=[transcribed_text, output_text, output_speech],
    theme=gr.themes.Soft(),
    title="Speech Translation Synthesis",
    description="This speech-to-speech translator uses the Whisper model for speech-to-text "
                "transcription, the translate library for translation, and the Coqui TTS model for text-to-speech "
                "synthesis.",
    allow_flagging="never"
)

# share=True publishes a temporary public URL; debug=True keeps the process
# in the foreground and prints errors to the console.
demo.launch(debug=True, share=True)