import os

# Whisper is installed from source at startup (a common pattern for Hugging Face Spaces).
os.system("pip install git+https://github.com/openai/whisper.git")

import tempfile

import gradio as gr
import numpy as np
import openai
import whisper
from elevenlabs import generate, set_api_key, UnauthenticatedRateLimitError
from gtts import gTTS
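
# The pre-1.0 openai package used here (openai.ChatCompletion) reads
# OPENAI_API_KEY from the environment on import. A minimal sketch for setting
# the key explicitly instead:
#
#   openai.api_key = os.environ["OPENAI_API_KEY"]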

# Whisper's LANGUAGES maps codes to names ("fr" -> "french");
# TO_LANGUAGE_CODE maps names to codes ("french" -> "fr").
to_language_dict = whisper.tokenizer.LANGUAGES
to_language_code_dict = whisper.tokenizer.TO_LANGUAGE_CODE
language_list = [language.capitalize() for language in to_language_code_dict.keys()]

model = whisper.load_model("small")
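# Note: Whisper ships several checkpoints (tiny, base, small, medium, large);
# "small" keeps this demo responsive at some cost in accuracy.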


def inference(audio):
    # Load the recording and pad/trim it to Whisper's 30-second window.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    # Detect the spoken language from the log-mel spectrogram.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)
    input_language = max(probs, key=probs.get)
    result = whisper.transcribe(model=model, audio=audio, language=input_language, fp16=False, verbose=False)
    return result["text"], to_language_dict[input_language].capitalize()
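
# Hedged usage sketch (the file name is hypothetical):
#
#   text, detected = inference("recording.wav")
#   print(f"{detected}: {text}")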


def decapitalize(s):
    if not s:
        return s
    return s[0].lower() + s[1:]


def translate(text, input_language, output_language):
    # Call the OpenAI API only when source and target languages differ.
    if input_language != output_language:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a professional translator."},
                {"role": "user", "content": f"Translate from {input_language} to {output_language} the following phrase: '{text}'. Remove the pronunciation and comments."},
            ],
        )
        text = response["choices"][0]["message"]["content"]
    return text, output_language
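
# Hedged usage sketch (the returned string depends on the model; the value
# shown is illustrative):
#
#   translated, lang = translate("Hello, world", "English", "French")
#   # translated == "Bonjour, le monde", lang == "French"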


def tts(text: str, language: str):
    # Map the capitalized language name ("French") back to its code ("fr").
    language = to_language_code_dict[decapitalize(language)]
    # Synthesize speech with gTTS into a temporary mp3 and return its path.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
        temp_path = fp.name
    gTTS(text, lang=language).save(temp_path)
    return temp_path


def generate_audio(translated_text):
    # Alternative ElevenLabs synthesis path (the "Generate Voice" button below
    # is currently wired to the gTTS-based tts() instead).
    try:
        return generate_voice(text=translated_text, voice_name="Sam", model_name="eleven_multilingual_v1")
    except Exception as e:
        raise gr.Error(str(e))


def pad_buffer(audio):
    # Pad the raw byte buffer so its length is a multiple of the int16 sample size.
    buffer_size = len(audio)
    element_size = np.dtype(np.int16).itemsize
    if buffer_size % element_size != 0:
        audio = audio + b"\0" * (element_size - (buffer_size % element_size))
    return audio
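
# Worked example: a 5-byte buffer (5 % 2 == 1) gets one b"\0" appended, so
# np.frombuffer(..., dtype=np.int16) sees three complete 16-bit samples.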


def generate_voice(text, voice_name, model_name):
    try:
        # Clamp the prompt to 250 characters before sending it to ElevenLabs.
        audio = generate(
            text[:250],
            voice=voice_name,
            model=model_name,
        )
        # Gradio's "numpy" audio format is (sample_rate, samples).
        return (44100, np.frombuffer(pad_buffer(audio), dtype=np.int16))
    except UnauthenticatedRateLimitError:
        raise gr.Error("Thanks for trying out ElevenLabs TTS! You've reached the free tier limit. Please provide an API key to continue.")
    except Exception as e:
        raise gr.Error(str(e))
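
# A minimal sketch for authenticated ElevenLabs usage, assuming the key lives
# in an ELEVEN_API_KEY environment variable (the variable name is an
# assumption, not part of this app):
#
#   set_api_key(os.environ["ELEVEN_API_KEY"])
#   sample_rate, samples = generate_voice("Bonjour", "Sam", "eleven_multilingual_v1")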


css = """
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
    color: white;
    border-color: black;
    background: black;
}
input[type='range'] {
    accent-color: black;
}
.dark input[type='range'] {
    accent-color: #dfdfdf;
}
.container {
    max-width: 730px;
    margin: auto;
    padding-top: 1.5rem;
}
.details:hover {
    text-decoration: underline;
}
.gr-button {
    white-space: nowrap;
}
.gr-button:focus {
    border-color: rgb(147 197 253 / var(--tw-border-opacity));
    outline: none;
    box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
    --tw-border-opacity: 1;
    --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
    --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px + var(--tw-ring-offset-width)) var(--tw-ring-color);
    --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
    --tw-ring-opacity: .5;
}
.footer {
    margin-bottom: 45px;
    margin-top: 35px;
    text-align: center;
    border-bottom: 1px solid #e5e5e5;
}
.footer>p {
    font-size: .8rem;
    display: inline-block;
    padding: 0 10px;
    transform: translateY(10px);
    background: white;
}
.dark .footer {
    border-color: #303030;
}
.dark .footer>p {
    background: #0b0f19;
}
.prompt h4 {
    margin: 1.25em 0 .25em 0;
    font-weight: bold;
    font-size: 115%;
}
.animate-spin {
    animation: spin 1s linear infinite;
}
@keyframes spin {
    from {
        transform: rotate(0deg);
    }
    to {
        transform: rotate(360deg);
    }
}
"""


block = gr.Blocks(css=css)

with block:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 650px; margin: 0 auto;">
            <div
                style="
                    display: inline-flex;
                    align-items: center;
                    gap: 0.8rem;
                    font-size: 1.75rem;
                "
            >
                <h1 style="font-weight: 900; margin-bottom: 7px;">
                    Speech2Speech Translator
                </h1>
            </div>
            <p style="margin-bottom: 10px; font-size: 94%">
                Speech2Speech Translator chains three models: OpenAI's speech-to-text model Whisper, GPT-3.5 for translation, and ElevenLabs' text-to-speech model Eleven Multilingual. Input audio is truncated to roughly 30 seconds. The demo doesn't work in Firefox; try it in Chrome or Edge.
            </p>
        </div>
        """
    )

    with gr.Group():
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                audio = gr.Audio(
                    label="Input Audio",
                    show_label=False,
                    source="microphone",
                    type="filepath",
                )
                btn = gr.Button("Transcribe")

        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                text = gr.Textbox(label="Transcribed text:", elem_id="result-textarea")
                input_language = gr.Textbox(label="Language detected:", elem_id="result-textarea")

        with gr.Box():
            with gr.Row().style(equal_height=True):
                output_language = gr.Dropdown(
                    language_list,
                    label="Output Language",
                    value="French",
                    elem_id="output_language",
                )
                btn_translate = gr.Button("Translate")

        with gr.Box():
            with gr.Row().style(equal_height=True):
                translated_text = gr.Textbox(label="Translated text:", elem_id="result-textarea")
                btn_generate = gr.Button("Generate Voice")

        out_audio = gr.Audio(
            label="Generated Voice",
            type="numpy",
            elem_id="out_audio",
        )

    # Step 1: transcribe the recording and detect its language.
    btn.click(inference, inputs=[audio], outputs=[text, input_language])

    # Step 2: translate the transcription into the selected language.
    btn_translate.click(translate, inputs=[text, input_language, output_language], outputs=[translated_text, output_language])

    # Step 3: synthesize speech from the translated text (gTTS path).
    btn_generate.click(tts, inputs=[translated_text, output_language], outputs=[out_audio])

    gr.HTML('''
        <div class="footer">
            <p>STT Model by <a href="https://github.com/openai/whisper" style="text-decoration: underline;" target="_blank">OpenAI</a> - TTS Model by <a href="https://beta.elevenlabs.io/" style="text-decoration: underline;" target="_blank">ElevenLabs</a> - Demo by <a href="https://twitter.com/alonsosilva" style="text-decoration: underline;" target="_blank">alonsosilva</a></p>
        </div>
    ''')

block.launch()