# Source: Hugging Face Space by Yusin — "Update app.py", commit d25de88
# (The three lines above were scraped web-page text; kept as a comment so the file parses.)
import gradio as gr
from PIL import Image  # NOTE(review): imported but never used in this file — confirm before removing
import os
#from diffusers import StableDiffusionPipeline
# Remote proxy to the hosted Whisper Space; invoked in translate_better() below.
whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
#stable_diffusion = gr.Blocks.load(name="spaces/stabilityai/stable-diffusion")
# Remote proxy to the hosted Stable Diffusion v1.5 Space; invoked in get_images() below.
stable_diffusion = gr.Blocks.load(name="spaces/runwayml/stable-diffusion-v1-5")
### β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# NOTE(review): `title` is never referenced below (not passed to gr.Blocks or launch) —
# presumably intended as the demo title; confirm before removing.
title="Talking to Stable Diffusion"
### β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
def get_images(prompt):
    """Run the remote Stable Diffusion Space on ``prompt``.

    The Space writes its generated images into a gallery directory and
    returns that directory's path; we return one filesystem path per
    image file found inside it.
    """
    gallery_dir = stable_diffusion(prompt, fn_index=2)
    image_paths = []
    for filename in os.listdir(gallery_dir):
        image_paths.append(os.path.join(gallery_dir, filename))
    return image_paths
def translate_better(audio):
    """Send ``audio`` to the remote Whisper Space twice.

    One call transcribes in the detected spoken language, the other
    translates to English. Returns a ``(transcription, translation)``
    pair of strings.
    """
    print("""
    β€”
    Sending audio to Whisper ...
    β€”
    """)
    transcription = whisper(audio, None, "transcribe", fn_index=0)
    translation = whisper(audio, None, "translate", fn_index=0)
    print("transcript: " + transcription)
    print("β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”")
    print("translated: " + translation)
    return transcription, translation
# UI layout: record audio -> (optionally) inspect Whisper output -> generate images.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        ## 1. Say what you want:
        """
    )
    with gr.Column():
        with gr.Tab(label="Record audio input", elem_id="record_tab"):
            with gr.Column():
                record_input = gr.Audio(
                    source="microphone",
                    type="filepath",
                    show_label=False,
                    elem_id="record_btn"
                )
                with gr.Row():
                    audio_r_translate = gr.Button("Check Whisper first", elem_id="check_btn_1")
                    audio_r_direct_sd = gr.Button("Generating Images", elem_id="magic_btn_1")
        # Hidden for now: sliders exist so they can be wired into get_images later.
        with gr.Accordion(label="Stable Diffusion Settings", elem_id="sd_settings", visible=False):
            with gr.Row():
                guidance_scale = gr.Slider(2, 15, value=7, label='Guidance Scale')
                nb_iterations = gr.Slider(10, 50, value=25, step=1, label='Steps')
                seed = gr.Slider(label="Seed", minimum=0, maximum=2147483647, step=1, randomize=True)
    gr.Markdown(
        """
        ## 2. Check Whisper output:
        """
    )
    with gr.Row():
        transcripted_output = gr.Textbox(
            label="Transcription in your detected spoken language",
            lines=3,
            elem_id="transcripted"
        )
        # BUG FIX: this label was a copy-paste duplicate of the transcription
        # box's label; it shows the English translation, not the transcription.
        translated_output = gr.Textbox(
            label="Translation to English",
            lines=3,
            elem_id="translated"
        )
    gr.Markdown("""
        ## 3. Wait for Stable Diffusion Results about ~10 seconds
        """
    )
    sd_output = gr.Gallery().style(grid=2, height="auto")

    # "Check Whisper first": fill both text boxes from the recording.
    audio_r_translate.click(
        translate_better,
        inputs=[
            record_input
        ],
        outputs=[
            transcripted_output,
            translated_output,
        ])
    # "Generating Images": feed the English translation straight to Stable Diffusion.
    audio_r_direct_sd.click(
        get_images,
        inputs=[
            translated_output
        ],
        outputs=sd_output
    )
if __name__ == "__main__":
    # Queue incoming requests so concurrent users are served without dropping jobs.
    demo.queue(max_size=32, concurrency_count=20).launch()