# app.py — Speech2Speech Translator (Gradio Space by alonsosilva, commit 2fc66cc)
import os
# Install Whisper from GitHub at startup (Hugging Face Spaces pattern; done
# before `import whisper` so the import below succeeds on a fresh container).
os.system("pip install git+https://github.com/openai/whisper.git")
import gradio as gr
import whisper
import numpy as np
from elevenlabs import voices, generate, set_api_key, UnauthenticatedRateLimitError
from transformers import MarianMTModel, MarianTokenizer
import openai
import tempfile
from gtts import gTTS
# assert os.getenv("ELEVEN_LABS_API_KEY"), "env variable ELEVEN_LABS_API_KEY must be set"
# from elevenlabs import set_api_key
# set_api_key(ELEVEN_LABS_API_KEY)
# Whisper's language tables: code -> language name, and language name -> code.
to_language_dict = whisper.tokenizer.LANGUAGES
to_language_code_dict = whisper.tokenizer.TO_LANGUAGE_CODE
# Capitalized language names shown in the output-language dropdown.
language_list = list(to_language_code_dict.keys())
language_list = [language.capitalize() for language in language_list]
# Whisper "small" checkpoint, loaded once at module import.
model = whisper.load_model("small")
def inference(audio):
    """Transcribe an audio file with Whisper and auto-detect its language.

    Parameters:
        audio: path to the recorded audio file (gr.Audio type="filepath").

    Returns:
        (transcribed_text, detected_language_name_capitalized)
    """
    # Whisper operates on fixed 30-second windows, hence pad_or_trim.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)
    # Pick the most probable language from the detector's distribution.
    _, lang_probs = model.detect_language(mel)
    detected = max(lang_probs, key=lang_probs.get)
    result = whisper.transcribe(
        audio=waveform, model=model, language=detected, fp16=False, verbose=False
    )
    return result['text'], to_language_dict[detected].capitalize()
def decapitalize(s):
    """Return *s* with its first character lower-cased; empty strings pass through unchanged."""
    return s[0].lower() + s[1:] if s else s
def translate(text, input_language, output_language):
    """Translate *text* between two named languages via gpt-3.5-turbo.

    When the source and target languages match, the text is returned
    unchanged without calling the API.

    Returns:
        (translated_text, output_language)
    """
    if input_language == output_language:
        return text, output_language
    messages = [
        {"role": "system", "content": "You are a professional translator."},
        {"role": "user", "content": f"Translate from {input_language} to {output_language} the following phrase:'{text}'. Remove the pronunciation and comments."},
    ]
    response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    return response["choices"][0]["message"]["content"], output_language
def tts(text: str, language: str) -> str:
    """Synthesize *text* with Google TTS (gTTS) and return the path to an mp3 file.

    Parameters:
        text: the text to speak.
        language: capitalized language name (e.g. "French"), mapped to a
            language code via to_language_code_dict.

    Returns:
        Filesystem path to the generated mp3 (for gr.Audio type="filepath").
    """
    code = to_language_code_dict[decapitalize(language)]
    # BUGFIX: the original created a .wav NamedTemporaryFile it never used
    # (leaked on disk with delete=False) and then saved to a fixed
    # "./hello.mp3", which races between concurrent requests and can serve
    # stale audio. Write each synthesis to its own unique temp file instead.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
        out_path = fp.name
    gTTS(text, lang=code).save(out_path)
    return out_path
def generate_audio(translated_text):
    """Synthesize *translated_text* with ElevenLabs via generate_voice.

    Returns:
        (sample_rate, int16 ndarray) tuple for a gr.Audio(type="numpy") output.

    Raises:
        gr.Error: with a human-readable message on any failure.
    """
    try:
        return generate_voice(text=translated_text, voice_name="Sam", model_name="eleven_multilingual_v1")
    except gr.Error:
        # generate_voice already raises user-facing gr.Error — don't re-wrap it.
        raise
    except Exception as e:
        # BUGFIX: gr.Error expects a message string; passing the exception
        # object rendered poorly in the UI. Keep the original cause chained.
        raise gr.Error(str(e)) from e
def pad_buffer(audio):
    """Zero-pad raw audio bytes so the length is a whole number of int16 samples.

    np.frombuffer(..., dtype=np.int16) requires the buffer length to be a
    multiple of 2 bytes; append trailing NUL bytes when it is not.
    """
    sample_width = np.dtype(np.int16).itemsize
    remainder = len(audio) % sample_width
    if remainder:
        audio = audio + b'\0' * (sample_width - remainder)
    return audio
def generate_voice(text, voice_name, model_name):
    """Call the ElevenLabs TTS API and decode the response to a numpy waveform.

    Parameters:
        text: text to speak (truncated to 250 chars to stay within free-tier limits).
        voice_name: ElevenLabs voice identifier (e.g. "Sam").
        model_name: ElevenLabs model identifier (e.g. "eleven_multilingual_v1").

    Returns:
        (44100, int16 ndarray) tuple for a gr.Audio(type="numpy") output.

    Raises:
        gr.Error: on rate limiting or any other API failure.
    """
    try:
        audio = generate(
            text[:250],  # Limit to 250 characters
            voice=voice_name,
            model=model_name
        )
        return (44100, np.frombuffer(pad_buffer(audio), dtype=np.int16))
    except UnauthenticatedRateLimitError:
        raise gr.Error("Thanks for trying out ElevenLabs TTS! You've reached the free tier limit. Please provide an API key to continue.")
    except Exception as e:
        # BUGFIX: gr.Error expects a message string, not the exception object;
        # chain the cause so the traceback stays useful in the server log.
        raise gr.Error(str(e)) from e
# Custom CSS theme for the Gradio UI (adapted from the Stable Diffusion demo).
# BUGFIX: the focus ring rule used `calc(3px var(--tw-ring-offset-width))`,
# which is invalid CSS (calc requires an operator between terms); the missing
# `+` has been restored so the ring width actually applies.
css = """
        .gradio-container {
            font-family: 'IBM Plex Sans', sans-serif;
        }
        .gr-button {
            color: white;
            border-color: black;
            background: black;
        }
        input[type='range'] {
            accent-color: black;
        }
        .dark input[type='range'] {
            accent-color: #dfdfdf;
        }
        .container {
            max-width: 730px;
            margin: auto;
            padding-top: 1.5rem;
        }
        .details:hover {
            text-decoration: underline;
        }
        .gr-button {
            white-space: nowrap;
        }
        .gr-button:focus {
            border-color: rgb(147 197 253 / var(--tw-border-opacity));
            outline: none;
            box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
            --tw-border-opacity: 1;
            --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
            --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px + var(--tw-ring-offset-width)) var(--tw-ring-color);
            --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
            --tw-ring-opacity: .5;
        }
        .footer {
            margin-bottom: 45px;
            margin-top: 35px;
            text-align: center;
            border-bottom: 1px solid #e5e5e5;
        }
        .footer>p {
            font-size: .8rem;
            display: inline-block;
            padding: 0 10px;
            transform: translateY(10px);
            background: white;
        }
        .dark .footer {
            border-color: #303030;
        }
        .dark .footer>p {
            background: #0b0f19;
        }
        .prompt h4{
            margin: 1.25em 0 .25em 0;
            font-weight: bold;
            font-size: 115%;
        }
        .animate-spin {
            animation: spin 1s linear infinite;
        }
        @keyframes spin {
            from {
                transform: rotate(0deg);
            }
            to {
                transform: rotate(360deg);
            }
        }
"""
# Build the Gradio UI: record -> transcribe (Whisper) -> translate (GPT-3.5)
# -> synthesize (gTTS), with each stage wired to its own button.
block = gr.Blocks(css=css)
with block:
    gr.HTML(
        """
          <div style="text-align: center; max-width: 650px; margin: 0 auto;">
            <div
              style="
                display: inline-flex;
                align-items: center;
                gap: 0.8rem;
                font-size: 1.75rem;
              "
            >
              <h1 style="font-weight: 900; margin-bottom: 7px;">
                Speech2Speech Translator
              </h1>
            </div>
            <p style="margin-bottom: 10px; font-size: 94%">
              Speech2Speech Translator is the composition of two models: speech2text model Whisper by OpenAI and text2speech model Eleven Multilingual by ElevenLabs. This demo cuts audio after around 30 secs. This demo doesn't work in Firefox. Try it on Chrome or Edge.
            </p>
          </div>
        """
    )
    with gr.Group():
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                audio = gr.Audio(
                    label="Input Audio",
                    show_label=False,
                    source="microphone",
                    type="filepath"
                )
                btn = gr.Button("Transcribe")
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                text = gr.Textbox(label="Transcribed text:", elem_id="result-textarea")
                input_language = gr.Textbox(label="Language detected:", elem_id="result-textarea")
        with gr.Box():
            with gr.Row().style(equal_height=True):
                output_language = gr.Dropdown(
                    language_list,
                    label="Output Language",
                    value="French",
                    elem_id="output_language"
                )
                btn_translate = gr.Button("Translate")
        with gr.Box():
            with gr.Row().style(equal_height=True):
                translated_text = gr.Textbox(label="Translated text:", elem_id="result-textarea")
                btn_generate = gr.Button("Generate Voice")
        # BUGFIX: the wired handler (tts) returns a file path, but this
        # component was declared type="numpy", which expects a
        # (sample_rate, ndarray) tuple and broke playback.
        out_audio = gr.Audio(
            label="Generated Voice",
            type="filepath",
            elem_id="out_audio"
        )
    # Stage 1: speech -> text + detected language.
    btn.click(inference, inputs=[audio], outputs=[text, input_language])
    # Stage 2: text -> translated text in the chosen language.
    btn_translate.click(
        translate,
        inputs=[text, input_language, output_language],
        outputs=[translated_text, output_language],
    )
    # Stage 3: translated text -> synthesized speech (gTTS filepath).
    btn_generate.click(tts, inputs=[translated_text, output_language], outputs=[out_audio])
    # BUGFIX: the footer attributions were swapped — Whisper (OpenAI) is the
    # speech-to-text model and ElevenLabs provides text-to-speech.
    gr.HTML('''
        <div class="footer">
            <p>STT Model by <a href="https://github.com/openai/whisper" style="text-decoration: underline;" target="_blank">OpenAI</a> - TTS Model by <a href="https://beta.elevenlabs.io/" style="text-decoration:underline;" target="_blank">ElevenLabs</a> - Demo by <a href="https://twitter.com/alonsosilva" style="text-decoration: underline;" target="_blank">alonsosilva</a>
            </p>
        </div>
    ''')
block.launch()