import gradio as gr
from PIL import Image
import os
#from diffusers import StableDiffusionPipeline

# Load hosted Spaces as callable endpoints (Gradio 3.x API)
whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
#stable_diffusion = gr.Blocks.load(name="spaces/stabilityai/stable-diffusion")
stable_diffusion = gr.Blocks.load(name="spaces/runwayml/stable-diffusion-v1-5")

### ————————————————————————————————————————
title = "Talking to Stable Diffusion"
### ————————————————————————————————————————

def get_images(prompt):
    # Call the Stable Diffusion Space's generation endpoint (fn_index=2),
    # which returns a directory containing the generated images.
    #gallery_dir = stable_diffusion(prompt, None, None, fn_index=2)
    gallery_dir = stable_diffusion(prompt, fn_index=2)
    return [os.path.join(gallery_dir, img) for img in os.listdir(gallery_dir)]

def translate_better(audio):
    print("""
    —
    Sending audio to Whisper ...
    —
    """)
    # Run Whisper twice: once to transcribe in the detected spoken language,
    # once to translate the speech to English.
    transcribe_text_result = whisper(audio, None, "transcribe", fn_index=0)
    translate_text_result = whisper(audio, None, "translate", fn_index=0)
    print("transcript: " + transcribe_text_result)
    print("———————————————————————————————————————————")
    print("translated: " + translate_text_result)
    return transcribe_text_result, translate_text_result

with gr.Blocks() as demo:
    gr.Markdown(
        """
        ## 1. Say what you want:
        """
    )
    with gr.Column():
        with gr.Tab(label="Record audio input", elem_id="record_tab"):
            with gr.Column():
                record_input = gr.Audio(
                    source="microphone",
                    type="filepath",
                    show_label=False,
                    elem_id="record_btn"
                )
                with gr.Row():
                    audio_r_translate = gr.Button("Check Whisper first", elem_id="check_btn_1")
                    audio_r_direct_sd = gr.Button("Generate Images", elem_id="magic_btn_1")

        # Hidden settings panel; these sliders are defined but not yet wired
        # into the Stable Diffusion call above.
        with gr.Accordion(label="Stable Diffusion Settings", elem_id="sd_settings", visible=False):
            with gr.Row():
                guidance_scale = gr.Slider(2, 15, value=7, label='Guidance Scale')
                nb_iterations = gr.Slider(10, 50, value=25, step=1, label='Steps')
                seed = gr.Slider(label="Seed", minimum=0, maximum=2147483647, step=1, randomize=True)

    gr.Markdown(
        """
        ## 2. Check Whisper output:
        """
    )
    with gr.Row():
        transcripted_output = gr.Textbox(
            label="Transcription in your detected spoken language",
            lines=3,
            elem_id="transcripted"
        )
        translated_output = gr.Textbox(
            label="Transcription translated to English",
            lines=3,
            elem_id="translated"
        )

    gr.Markdown("""
    ## 3. Wait for Stable Diffusion results (~10 seconds)
    """
    )
    sd_output = gr.Gallery().style(grid=2, height="auto")

    audio_r_translate.click(
        translate_better,
        inputs=[record_input],
        outputs=[transcripted_output, translated_output]
    )

    audio_r_direct_sd.click(
        get_images,
        inputs=[translated_output],
        outputs=sd_output
    )

if __name__ == "__main__":
    demo.queue(max_size=32, concurrency_count=20).launch()