"""Gradio demo: speak (or upload) a prompt, let Whisper transcribe/translate it,
then send the English text to Stable Diffusion and show the generated images.

Both models are consumed as remote Hugging Face Spaces; nothing runs locally.
"""

import os
from typing import List, Tuple

import gradio as gr
from PIL import Image

#from diffusers import StableDiffusionPipeline

# Remote Spaces wrapped as Python callables. Loading happens at import time.
whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
stable_diffusion = gr.Blocks.load(name="spaces/runwayml/stable-diffusion-v1-5")

### ————————————————————————————————————————

title = "Talking to Stable Diffusion"

### ————————————————————————————————————————

# Metadata file that Gradio's gallery deserialization is believed to write next
# to the downloaded images. NOTE(review): confirm against the installed Gradio
# version; skipping it is harmless if the file is never created.
_CAPTIONS_FILE = "captions.json"


def get_images(prompt: str) -> List[str]:
    """Generate images for *prompt* with the remote Stable Diffusion Space.

    Args:
        prompt: Text prompt forwarded to the Space (endpoint ``fn_index=2``).

    Returns:
        Paths of the files found in the directory returned by the Space,
        in sorted order, excluding the gallery's ``captions.json`` metadata.
    """
    gallery_dir = stable_diffusion(prompt, fn_index=2)
    # Sort for a deterministic gallery order (os.listdir order is arbitrary)
    # and drop the captions metadata so only image files reach gr.Gallery.
    return [
        os.path.join(gallery_dir, name)
        for name in sorted(os.listdir(gallery_dir))
        if name != _CAPTIONS_FILE
    ]


def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
    """Run the whole pipeline in one go: audio -> Whisper -> Stable Diffusion.

    Args:
        audio: Path of the recorded or uploaded audio file.
        guidance_scale: Value of the "Guidance Scale" slider. Currently not
            forwarded to the Stable Diffusion Space.
        nb_iterations: Value of the "Steps" slider. Currently not forwarded.
        seed: Value of the "Seed" slider. Currently not forwarded.

    Returns:
        A ``(transcript, translation, image_paths)`` tuple matching the three
        output components wired to the "Magic Whisper" buttons.
    """
    transcript, translation = translate_better(audio)
    # The translated text is what gets used as the image prompt.
    images = get_images(translation)
    return transcript, translation, images


def translate_better(audio) -> Tuple[str, str]:
    """Send *audio* to the Whisper Space twice: once to transcribe, once to translate.

    Args:
        audio: Path of the recorded or uploaded audio file.

    Returns:
        A ``(transcript, translation)`` tuple of the two Whisper results.

    Raises:
        gr.Error: If no audio was provided, so the user sees a readable
            message instead of a failure from the remote call.
    """
    if audio is None:
        raise gr.Error("Please record or upload some audio first.")

    print(""" — Sending audio to Whisper ... — """)

    # Second positional input is left empty; third selects the Whisper task.
    transcribe_text_result = whisper(audio, None, "transcribe", fn_index=0)
    translate_text_result = whisper(audio, None, "translate", fn_index=0)

    # f-strings keep the log lines from raising if a result is not a str.
    print(f"transcript: {transcribe_text_result}")
    print("———————————————————————————————————————————")
    print(f"translated: {translate_text_result}")

    return transcribe_text_result, translate_text_result


with gr.Blocks() as demo:
    gr.Markdown(
        """
        ## 1. Say what you want:
        """
    )

    with gr.Column():
        with gr.Tab(label="Record audio input", elem_id="record_tab"):
            with gr.Column():
                record_input = gr.Audio(
                    source="microphone",
                    type="filepath",
                    show_label=False,
                    elem_id="record_btn",
                )
                with gr.Row():
                    audio_r_translate = gr.Button("Check Whisper first ? 👍", elem_id="check_btn_1")
                    audio_r_direct_sd = gr.Button("Magic Whisper › SD right now!", elem_id="magic_btn_1")

        with gr.Tab(label="Upload audio input", elem_id="upload_tab"):
            with gr.Column():
                upload_input = gr.Audio(
                    source="upload",
                    type="filepath",
                    show_label=False,
                    elem_id="upload_area",
                )
                with gr.Row():
                    audio_u_translate = gr.Button("Check Whisper first ? 👍", elem_id="check_btn_2")
                    audio_u_direct_sd = gr.Button("Magic Whisper › SD right now!", elem_id="magic_btn_2")

        # Hidden settings panel: the sliders are passed to magic_whisper_to_sd,
        # which does not forward them to the Stable Diffusion Space.
        with gr.Accordion(label="Stable Diffusion Settings", elem_id="sd_settings", visible=False):
            with gr.Row():
                guidance_scale = gr.Slider(2, 15, value=7, label='Guidance Scale')
                nb_iterations = gr.Slider(10, 50, value=25, step=1, label='Steps')
                seed = gr.Slider(label="Seed", minimum=0, maximum=2147483647, step=1, randomize=True)

    gr.Markdown(
        """
        ## 2. Check Whisper output, correct it if necessary:
        """
    )

    with gr.Row():
        transcripted_output = gr.Textbox(
            label="Transcription in your detected spoken language",
            lines=3,
            elem_id="transcripted",
        )
        #language_detected_output = gr.Textbox(label="Native language", elem_id="spoken_lang",lines=3)

        with gr.Column():
            translated_output = gr.Textbox(
                label="Transcript translated in English by Whisper",
                lines=4,
                elem_id="translated",
            )
            with gr.Row():
                clear_btn = gr.Button(value="Clear")
                diffuse_btn = gr.Button(value="OK, Diffuse this prompt !", elem_id="diffuse_btn")

    # Event registration order below is kept identical to the original script
    # so the fn_index of each endpoint of this app does not shift.
    clear_btn.click(
        fn=lambda value: gr.update(value=""),
        inputs=clear_btn,
        outputs=translated_output,
    )

    gr.Markdown(
        """
        ## 3. Wait for Stable Diffusion Results ☕️
        Inference time is about ~10 seconds, when it's your turn 😬
        """
    )

    sd_output = gr.Gallery().style(grid=2, height="auto")

    # "Check Whisper first" buttons: only fill the two text boxes.
    for check_btn, audio_input in (
        (audio_r_translate, record_input),
        (audio_u_translate, upload_input),
    ):
        check_btn.click(
            translate_better,
            inputs=audio_input,
            outputs=[transcripted_output, translated_output],
        )

    # "Magic Whisper" buttons: fill the text boxes and the gallery in one step.
    for magic_btn, audio_input in (
        (audio_r_direct_sd, record_input),
        (audio_u_direct_sd, upload_input),
    ):
        magic_btn.click(
            magic_whisper_to_sd,
            inputs=[audio_input, guidance_scale, nb_iterations, seed],
            outputs=[transcripted_output, translated_output, sd_output],
        )

    # Generate from the (possibly hand-corrected) English text box.
    diffuse_btn.click(get_images, inputs=[translated_output], outputs=sd_output)

if __name__ == "__main__":
    demo.queue(max_size=32, concurrency_count=20).launch()