import gradio as gr
from PIL import Image
import os
#from diffusers import StableDiffusionPipeline
whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
stable_diffusion = gr.Blocks.load(name="spaces/runwayml/stable-diffusion-v1-5")
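# Note: the fn_index values used below (0 for the Whisper endpoints, 2 for the
# Stable Diffusion gallery endpoint) are positional and tied to the current
# layouts of the two Spaces loaded above; if either Space reorders its
# functions, these indices will need to be updated.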
### --------------------------------------------------
title = "Talking to Stable Diffusion"
### --------------------------------------------------
def get_images(prompt):
    # Send the text prompt to the Stable Diffusion Space and collect every
    # image file written to the returned gallery directory.
    gallery_dir = stable_diffusion(prompt, fn_index=2)
    return [os.path.join(gallery_dir, img) for img in os.listdir(gallery_dir)]
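# Hypothetical usage sketch (the prompt text is illustrative only):
#   images = get_images("an astronaut riding a horse on the moon")
#   # -> list of local file paths, one per image in the returned gallery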
def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
    # Transcribe/translate the audio, then diffuse the English translation.
    # guidance_scale, nb_iterations and seed are wired in from the UI but are
    # not forwarded to the remote Space by get_images.
    whisper_results = translate_better(audio)
    prompt = whisper_results[1]
    images = get_images(prompt)
    return whisper_results[0], whisper_results[1], images
def translate_better(audio):
    # Call the Whisper Space twice: once for the raw transcript, once for the
    # English translation.
    print("""
    -
    Sending audio to Whisper ...
    -
    """)
    transcribe_text_result = whisper(audio, None, "transcribe", fn_index=0)
    translate_text_result = whisper(audio, None, "translate", fn_index=0)
    print("transcript: " + transcribe_text_result)
    print("-" * 46)
    print("translated: " + translate_text_result)
    return transcribe_text_result, translate_text_result
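# translate_better returns a (transcript, translation) pair; the English
# translation (index 1) is what gets used as the Stable Diffusion prompt,
# while both strings are surfaced in the UI below.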
with gr.Blocks() as demo:
    gr.Markdown(
        """
        ## 1. Say what you want:
        """
    )
    with gr.Column():
        with gr.Tab(label="Record audio input", elem_id="record_tab"):
            with gr.Column():
                record_input = gr.Audio(
                    source="microphone",
                    type="filepath",
                    show_label=False,
                    elem_id="record_btn"
                )
                with gr.Row():
                    audio_r_translate = gr.Button("Check Whisper first? 👍", elem_id="check_btn_1")
                    audio_r_direct_sd = gr.Button("Magic Whisper › SD right now!", elem_id="magic_btn_1")
        with gr.Tab(label="Upload audio input", elem_id="upload_tab"):
            with gr.Column():
                upload_input = gr.Audio(
                    source="upload",
                    type="filepath",
                    show_label=False,
                    elem_id="upload_area"
                )
                with gr.Row():
                    audio_u_translate = gr.Button("Check Whisper first? 👍", elem_id="check_btn_2")
                    audio_u_direct_sd = gr.Button("Magic Whisper › SD right now!", elem_id="magic_btn_2")
        with gr.Accordion(label="Stable Diffusion Settings", elem_id="sd_settings", visible=False):
            with gr.Row():
                guidance_scale = gr.Slider(2, 15, value=7, label="Guidance Scale")
                nb_iterations = gr.Slider(10, 50, value=25, step=1, label="Steps")
                seed = gr.Slider(label="Seed", minimum=0, maximum=2147483647, step=1, randomize=True)
    gr.Markdown(
        """
        ## 2. Check Whisper output, correct it if necessary:
        """
    )
    with gr.Row():
        transcripted_output = gr.Textbox(
            label="Transcription in your detected spoken language",
            lines=3,
            elem_id="transcripted"
        )
        #language_detected_output = gr.Textbox(label="Native language", elem_id="spoken_lang", lines=3)
    with gr.Column():
        translated_output = gr.Textbox(
            label="Transcript translated into English by Whisper",
            lines=4,
            elem_id="translated"
        )
        with gr.Row():
            clear_btn = gr.Button(value="Clear")
            diffuse_btn = gr.Button(value="OK, Diffuse this prompt!", elem_id="diffuse_btn")
        # Clear the translated prompt; the button itself supplies no input value.
        clear_btn.click(fn=lambda: gr.update(value=""), inputs=[], outputs=translated_output)
gr.Markdown("""
## 3. Wait for Stable Diffusion Results β˜•οΈ
Inference time is about ~10 seconds, when it's your turn 😬
"""
)
sd_output = gr.Gallery().style(grid=2, height="auto")
    audio_r_translate.click(translate_better,
                            inputs=record_input,
                            outputs=[
                                #language_detected_output,
                                transcripted_output,
                                translated_output
                            ])

    audio_u_translate.click(translate_better,
                            inputs=upload_input,
                            outputs=[
                                #language_detected_output,
                                transcripted_output,
                                translated_output
                            ])

    audio_r_direct_sd.click(magic_whisper_to_sd,
                            inputs=[
                                record_input,
                                guidance_scale,
                                nb_iterations,
                                seed
                            ],
                            outputs=[
                                #language_detected_output,
                                transcripted_output,
                                translated_output,
                                sd_output
                            ])

    audio_u_direct_sd.click(magic_whisper_to_sd,
                            inputs=[
                                upload_input,
                                guidance_scale,
                                nb_iterations,
                                seed
                            ],
                            outputs=[
                                #language_detected_output,
                                transcripted_output,
                                translated_output,
                                sd_output
                            ])

    diffuse_btn.click(get_images,
                      inputs=[
                          translated_output
                      ],
                      outputs=sd_output
                      )
if __name__ == "__main__":
    demo.queue(max_size=32, concurrency_count=20).launch()