import os

# Install Whisper from source at startup (a common Hugging Face Spaces workaround).
os.system("pip install git+https://github.com/openai/whisper.git")

import gradio as gr
import whisper
import numpy as np
from elevenlabs import generate, UnauthenticatedRateLimitError
import openai
import tempfile
from gtts import gTTS

# To use ElevenLabs TTS with your own key, uncomment the following lines:
# assert os.getenv("ELEVEN_LABS_API_KEY"), "env variable ELEVEN_LABS_API_KEY must be set"
# from elevenlabs import set_api_key
# set_api_key(os.getenv("ELEVEN_LABS_API_KEY"))
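
# The translation step below calls the OpenAI API; this assumes OPENAI_API_KEY
# is set in the environment (the openai client picks it up automatically).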

# Whisper's language tables: LANGUAGES maps code -> name ("fr" -> "french"),
# TO_LANGUAGE_CODE maps name -> code ("french" -> "fr").
to_language_dict = whisper.tokenizer.LANGUAGES
to_language_code_dict = whisper.tokenizer.TO_LANGUAGE_CODE
language_list = [language.capitalize() for language in to_language_code_dict]

model = whisper.load_model("small")

def inference(audio):
    """Transcribe the recorded audio with Whisper and detect its language."""
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)  # Whisper operates on 30-second windows
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)
    input_language = max(probs, key=probs.get)
    result = whisper.transcribe(audio=audio, model=model, language=input_language, fp16=False, verbose=False)
    return result["text"], to_language_dict[input_language].capitalize()

def decapitalize(s):
    if not s:  # guard against the empty string
        return s
    return s[0].lower() + s[1:]

def translate(text, input_language, output_language):
    """Translate text with GPT-3.5; skip the API call when the languages match."""
    if input_language != output_language:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a professional translator."},
                {"role": "user", "content": f"Translate from {input_language} to {output_language} the following phrase: '{text}'. Remove the pronunciation and comments."},
            ],
        )
        text = response["choices"][0]["message"]["content"]
    translated_text = text
    return translated_text, output_language
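
# Illustrative call (the exact wording returned by the model will vary):
# translate("Good morning", "English", "French")  # -> ("Bonjour", "French")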

def tts(text: str, language: str):
    """Synthesize speech with gTTS and return the path to an MP3 file."""
    language = decapitalize(language)
    language = to_language_code_dict[language]  # e.g. "french" -> "fr"
    # delete=False keeps the file on disk so Gradio can serve it afterwards.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
        gTTS(text, lang=language).save(fp.name)
        return fp.name

def generate_audio(translated_text):
    """ElevenLabs alternative to tts(); not wired to the UI by default."""
    try:
        out_audio = generate_voice(text=translated_text, voice_name="Sam", model_name="eleven_multilingual_v1")
        return out_audio
    except Exception as e:
        raise gr.Error(str(e))

def pad_buffer(audio):
    # Pad the byte buffer to a multiple of 2 bytes so numpy can view it as int16.
    buffer_size = len(audio)
    element_size = np.dtype(np.int16).itemsize
    if buffer_size % element_size != 0:
        audio = audio + b'\0' * (element_size - (buffer_size % element_size))
    return audio

def generate_voice(text, voice_name, model_name):
    try:
        audio = generate(
            text[:250],  # limit to 250 characters
            voice=voice_name,
            model=model_name,
        )
        return (44100, np.frombuffer(pad_buffer(audio), dtype=np.int16))
    except UnauthenticatedRateLimitError:
        raise gr.Error("Thanks for trying out ElevenLabs TTS! You've reached the free tier limit. Please provide an API key to continue.")
    except Exception as e:
        raise gr.Error(str(e))
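
# The (44100, ...) tuple assumes generate() returns raw 16-bit PCM at 44.1 kHz;
# if the client returns encoded audio (e.g. MP3 bytes), it would need decoding
# before being handed to gr.Audio as numpy data.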
| css = """ | |
| .gradio-container { | |
| font-family: 'IBM Plex Sans', sans-serif; | |
| } | |
| .gr-button { | |
| color: white; | |
| border-color: black; | |
| background: black; | |
| } | |
| input[type='range'] { | |
| accent-color: black; | |
| } | |
| .dark input[type='range'] { | |
| accent-color: #dfdfdf; | |
| } | |
| .container { | |
| max-width: 730px; | |
| margin: auto; | |
| padding-top: 1.5rem; | |
| } | |
| .details:hover { | |
| text-decoration: underline; | |
| } | |
| .gr-button { | |
| white-space: nowrap; | |
| } | |
| .gr-button:focus { | |
| border-color: rgb(147 197 253 / var(--tw-border-opacity)); | |
| outline: none; | |
| box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000); | |
| --tw-border-opacity: 1; | |
| --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color); | |
| --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color); | |
| --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity)); | |
| --tw-ring-opacity: .5; | |
| } | |
| .footer { | |
| margin-bottom: 45px; | |
| margin-top: 35px; | |
| text-align: center; | |
| border-bottom: 1px solid #e5e5e5; | |
| } | |
| .footer>p { | |
| font-size: .8rem; | |
| display: inline-block; | |
| padding: 0 10px; | |
| transform: translateY(10px); | |
| background: white; | |
| } | |
| .dark .footer { | |
| border-color: #303030; | |
| } | |
| .dark .footer>p { | |
| background: #0b0f19; | |
| } | |
| .prompt h4{ | |
| margin: 1.25em 0 .25em 0; | |
| font-weight: bold; | |
| font-size: 115%; | |
| } | |
| .animate-spin { | |
| animation: spin 1s linear infinite; | |
| } | |
| @keyframes spin { | |
| from { | |
| transform: rotate(0deg); | |
| } | |
| to { | |
| transform: rotate(360deg); | |
| } | |
| } | |
| """ | |

block = gr.Blocks(css=css)

with block:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 650px; margin: 0 auto;">
          <div
            style="
              display: inline-flex;
              align-items: center;
              gap: 0.8rem;
              font-size: 1.75rem;
            "
          >
            <h1 style="font-weight: 900; margin-bottom: 7px;">
              Speech2Speech Translator
            </h1>
          </div>
          <p style="margin-bottom: 10px; font-size: 94%">
            Speech2Speech Translator chains three models: OpenAI's speech-to-text
            model Whisper, GPT-3.5 for translation, and a text-to-speech model
            (gTTS here, with ElevenLabs' Eleven Multilingual as an alternative).
            Input audio is trimmed to about 30 seconds. The demo doesn't work in
            Firefox; try Chrome or Edge.
          </p>
        </div>
        """
    )
    with gr.Group():
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                audio = gr.Audio(
                    label="Input Audio",
                    show_label=False,
                    source="microphone",
                    type="filepath",
                )
                btn = gr.Button("Transcribe")
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                text = gr.Textbox(label="Transcribed text:", elem_id="result-textarea")
                input_language = gr.Textbox(label="Language detected:", elem_id="result-textarea")
        with gr.Box():
            with gr.Row().style(equal_height=True):
                output_language = gr.Dropdown(
                    language_list,
                    label="Output Language",
                    value="French",
                    elem_id="output_language",
                )
                btn_translate = gr.Button("Translate")
        with gr.Box():
            with gr.Row().style(equal_height=True):
                translated_text = gr.Textbox(label="Translated text:", elem_id="result-textarea")
                btn_generate = gr.Button("Generate Voice")
        out_audio = gr.Audio(
            label="Generated Voice",
            type="numpy",
            elem_id="out_audio",
        )

    # Wire each button to its pipeline step.
    btn.click(inference, inputs=[audio], outputs=[text, input_language])
    btn_translate.click(translate, inputs=[text, input_language, output_language], outputs=[translated_text, output_language])
    btn_generate.click(tts, inputs=[translated_text, output_language], outputs=[out_audio])
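
    # To use ElevenLabs instead of gTTS, swap the handler above (sketch; assumes
    # an ELEVEN_LABS_API_KEY has been configured via set_api_key):
    # btn_generate.click(generate_audio, inputs=[translated_text], outputs=[out_audio])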

    gr.HTML('''
        <div class="footer">
            <p>STT model by <a href="https://github.com/openai/whisper" style="text-decoration: underline;" target="_blank">OpenAI</a> - TTS model by <a href="https://beta.elevenlabs.io/" style="text-decoration: underline;" target="_blank">ElevenLabs</a> - Demo by <a href="https://twitter.com/alonsosilva" style="text-decoration: underline;" target="_blank">alonsosilva</a></p>
        </div>
    ''')

block.launch()