# Spaces:
# Runtime error
# Runtime error
import os

import gradio as gr
from PIL import Image

# from diffusers import StableDiffusionPipeline

# Remote Spaces used as backends: Whisper for transcription/translation and
# Stable Diffusion for text-to-image. NOTE: loading these performs network
# requests at import time and will fail offline.
whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
stable_diffusion = gr.Blocks.load(name="spaces/stabilityai/stable-diffusion")

# ----------------------------------------
title = "Talking to Stable Diffusion"
# ----------------------------------------
def get_images(prompt):
    """Generate images for *prompt* via the remote Stable Diffusion Space.

    ``fn_index=2`` targets the Space's text-to-image endpoint, which returns
    the path of a directory containing the generated images.

    Returns a list of absolute-ish file paths suitable for a ``gr.Gallery``.
    """
    gallery_dir = stable_diffusion(prompt, fn_index=2)
    # Sort for a deterministic gallery order; os.listdir order is arbitrary.
    return [os.path.join(gallery_dir, img) for img in sorted(os.listdir(gallery_dir))]
def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
    """One-shot pipeline: audio -> Whisper -> Stable Diffusion images.

    Returns ``(transcription, english_translation, images)``.

    NOTE(review): ``guidance_scale``, ``nb_iterations`` and ``seed`` come
    from the (hidden) settings sliders but are never forwarded to the
    Stable Diffusion call — confirm whether the remote endpoint accepts them.
    """
    transcription, translation = translate_better(audio)
    return transcription, translation, get_images(translation)
def translate_better(audio):
    """Run Whisper on *audio* twice: transcribe, then translate to English.

    ``fn_index=0`` is the Space's audio endpoint; the second positional
    argument (unused here) is passed as ``None``.

    Returns a ``(transcription, translation)`` tuple.
    """
    print("""
β
Sending audio to Whisper ...
β
""")
    transcription = whisper(audio, None, "transcribe", fn_index=0)
    translation = whisper(audio, None, "translate", fn_index=0)
    print("transcript: " + transcription)
    print("βββββββββββββββββββββββββββββββββββββββββββ")
    print("translated: " + translation)
    return transcription, translation
with gr.Blocks() as demo:
    gr.Markdown(
        """
## 1. Say what you want:
"""
    )
    with gr.Column():
        # Two input paths: record from the microphone, or upload a file.
        with gr.Tab(label="Record audio input", elem_id="record_tab"):
            with gr.Column():
                record_input = gr.Audio(
                    source="microphone",
                    type="filepath",
                    show_label=False,
                    elem_id="record_btn",
                )
                with gr.Row():
                    audio_r_translate = gr.Button("Check Whisper first ? π", elem_id="check_btn_1")
                    audio_r_direct_sd = gr.Button("Magic Whisper βΊ SD right now!", elem_id="magic_btn_1")
        with gr.Tab(label="Upload audio input", elem_id="upload_tab"):
            with gr.Column():
                upload_input = gr.Audio(
                    source="upload",
                    type="filepath",
                    show_label=False,
                    elem_id="upload_area",
                )
                with gr.Row():
                    audio_u_translate = gr.Button("Check Whisper first ? π", elem_id="check_btn_2")
                    audio_u_direct_sd = gr.Button("Magic Whisper βΊ SD right now!", elem_id="magic_btn_2")
        # Hidden panel: these sliders are wired as inputs to
        # magic_whisper_to_sd but are not forwarded to the SD endpoint.
        with gr.Accordion(label="Stable Diffusion Settings", elem_id="sd_settings", visible=False):
            with gr.Row():
                guidance_scale = gr.Slider(2, 15, value=7, label="Guidance Scale")
                nb_iterations = gr.Slider(10, 50, value=25, step=1, label="Steps")
                seed = gr.Slider(label="Seed", minimum=0, maximum=2147483647, step=1, randomize=True)

    gr.Markdown(
        """
## 2. Check Whisper output, correct it if necessary:
"""
    )
    with gr.Row():
        transcripted_output = gr.Textbox(
            label="Transcription in your detected spoken language",
            lines=3,
            elem_id="transcripted",
        )
        # language_detected_output = gr.Textbox(label="Native language", elem_id="spoken_lang", lines=3)
        with gr.Column():
            translated_output = gr.Textbox(
                label="Transcript translated in English by Whisper",
                lines=4,
                elem_id="translated",
            )
            with gr.Row():
                clear_btn = gr.Button(value="Clear")
                diffuse_btn = gr.Button(value="OK, Diffuse this prompt !", elem_id="diffuse_btn")

    # Fix: the original passed the Button component itself as an input
    # (inputs=clear_btn), which fed its label string into the lambda.
    # Clearing needs no inputs at all.
    clear_btn.click(fn=lambda: gr.update(value=""), inputs=[], outputs=translated_output)

    gr.Markdown(
        """
## 3. Wait for Stable Diffusion Results βοΈ
Inference time is about ~10 seconds, when it's your turn π¬
"""
    )
    sd_output = gr.Gallery().style(grid=2, height="auto")

    # "Check Whisper first": show transcription + translation only.
    audio_r_translate.click(
        translate_better,
        inputs=record_input,
        outputs=[
            # language_detected_output,
            transcripted_output,
            translated_output,
        ],
    )
    audio_u_translate.click(
        translate_better,
        inputs=upload_input,
        outputs=[
            # language_detected_output,
            transcripted_output,
            translated_output,
        ],
    )

    # "Magic" buttons: go straight from audio to generated images.
    audio_r_direct_sd.click(
        magic_whisper_to_sd,
        inputs=[
            record_input,
            guidance_scale,
            nb_iterations,
            seed,
        ],
        outputs=[
            # language_detected_output,
            transcripted_output,
            translated_output,
            sd_output,
        ],
    )
    audio_u_direct_sd.click(
        magic_whisper_to_sd,
        inputs=[
            upload_input,
            guidance_scale,
            nb_iterations,
            seed,
        ],
        outputs=[
            # language_detected_output,
            transcripted_output,
            translated_output,
            sd_output,
        ],
    )

    # Diffuse the (possibly user-corrected) English prompt.
    diffuse_btn.click(
        get_images,
        inputs=[
            translated_output,
        ],
        outputs=sd_output,
    )
if __name__ == "__main__":
    # Queue requests so concurrent users are served in turn; the remote
    # Spaces backends are the bottleneck, not this app.
    app = demo.queue(max_size=32, concurrency_count=20)
    app.launch()