Spaces:
Runtime error
Runtime error
File size: 7,275 Bytes
79d11aa ccbd1ab 79d11aa ccbd1ab 79d11aa ce803ce ccbd1ab 79d11aa ccbd1ab 79d11aa ccbd1ab 79d11aa ccbd1ab 79d11aa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
import gradio as gr
from PIL import Image
import os
#from diffusers import StableDiffusionPipeline
# Handles to two hosted Spaces, created once at import time. Both are later
# called like functions with an explicit ``fn_index`` (see translate_better
# and get_images).
whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
stable_diffusion = gr.Blocks.load(name="spaces/runwayml/stable-diffusion-v1-5")
### ----------------------------------------
# NOTE(review): ``title`` is not referenced anywhere in the visible code;
# confirm whether it is still needed.
title="Talking to Stable Diffusion"
### ----------------------------------------
def get_images(prompt):
    """Generate images for ``prompt`` and return the paths of the results.

    The prompt is forwarded unchanged to the loaded Stable Diffusion Space
    (``fn_index=2``). The value that comes back is treated as a directory
    containing the generated images.

    Args:
        prompt: Text prompt handed to the Stable Diffusion Space.

    Returns:
        A name-sorted list of paths to the regular, non-JSON files found in
        the returned directory. Empty when the Space returns nothing.
    """
    gallery_dir = stable_diffusion(prompt, fn_index=2)
    if not gallery_dir:
        # os.listdir(None) lists the current working directory, which would
        # leak unrelated files (or fail in os.path.join) instead of reporting
        # "no images".
        return []

    # os.listdir() returns entries in arbitrary order; sort so the gallery
    # shows results in a stable order.
    candidates = (
        os.path.join(gallery_dir, entry)
        for entry in sorted(os.listdir(gallery_dir))
    )
    # Keep only real files and drop JSON sidecars. NOTE(review): some Gradio
    # versions appear to write a ``captions.json`` next to the images when a
    # gallery is saved to disk; confirm against the installed version.
    return [
        path
        for path in candidates
        if os.path.isfile(path) and not path.lower().endswith(".json")
    ]
def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
    """Chain Whisper and Stable Diffusion in a single step.

    The audio is transcribed and translated via ``translate_better``; the
    translated text is then used as the prompt for ``get_images``.

    Args:
        audio: Audio input forwarded to ``translate_better``.
        guidance_scale: Accepted from the UI but not used by this function.
        nb_iterations: Accepted from the UI but not used by this function.
        seed: Accepted from the UI but not used by this function.

    Returns:
        A 3-tuple ``(transcription, translation, images)`` where ``images``
        is whatever ``get_images`` returns for the translation.
    """
    transcription, translation = translate_better(audio)
    generated = get_images(translation)
    return transcription, translation, generated
def translate_better(audio):
    """Run the Whisper Space on ``audio`` in both of its task modes.

    Whisper is called twice with ``fn_index=0``: first with the task
    ``"transcribe"``, then with ``"translate"``. Both results are logged to
    stdout before being returned.

    Args:
        audio: Audio input passed as the first argument to the Whisper
            Space. The second positional argument is always ``None``
            (presumably the Space's alternative audio input; confirm
            against the Space's API).

    Returns:
        A 2-tuple ``(transcribe_result, translate_result)``.
    """
    print("\nβ\nSending audio to Whisper ...\nβ\n")

    # Same call order as before: transcribe first, then translate.
    transcript, translation = [
        whisper(audio, None, task, fn_index=0)
        for task in ("transcribe", "translate")
    ]

    print("transcript: " + transcript)
    print("βββββββββββββββββββββββββββββββββββββββββββ")
    print("translated: " + translation)

    return transcript, translation
# UI definition: audio input (record or upload), Whisper output boxes, and a
# gallery for the generated images, followed by the button wiring.
# NOTE(review): the source this was recovered from had lost its indentation;
# the nesting of the layout containers below is reconstructed and should be
# confirmed against the intended layout.
# NOTE(review): several labels contain characters that look mis-encoded
# (e.g. "π", "βΊ", "βοΈ"); they are left untouched here and should be
# restored from the original source.
with gr.Blocks() as demo:
    # Step 1 heading.
    gr.Markdown(
        """
## 1. Say what you want:
"""
    )
    with gr.Column():
        # Tab 1: record from the microphone. The component hands a file path
        # to the callbacks (type="filepath").
        with gr.Tab(label="Record audio input", elem_id="record_tab"):
            with gr.Column():
                record_input = gr.Audio(
                    source="microphone",
                    type="filepath",
                    show_label=False,
                    elem_id="record_btn"
                )
                with gr.Row():
                    # Either inspect the Whisper output first, or go straight
                    # from audio to images.
                    audio_r_translate = gr.Button("Check Whisper first ? π", elem_id="check_btn_1")
                    audio_r_direct_sd = gr.Button("Magic Whisper βΊ SD right now!", elem_id="magic_btn_1")
        # Tab 2: same controls, but for an uploaded audio file.
        with gr.Tab(label="Upload audio input", elem_id="upload_tab"):
            with gr.Column():
                upload_input = gr.Audio(
                    source="upload",
                    type="filepath",
                    show_label=False,
                    elem_id="upload_area"
                )
                with gr.Row():
                    audio_u_translate = gr.Button("Check Whisper first ? π", elem_id="check_btn_2")
                    audio_u_direct_sd = gr.Button("Magic Whisper βΊ SD right now!", elem_id="magic_btn_2")
        # Created with visible=False. The slider values are still passed to
        # magic_whisper_to_sd, which does not use them.
        with gr.Accordion(label="Stable Diffusion Settings", elem_id="sd_settings", visible=False):
            with gr.Row():
                guidance_scale = gr.Slider(2, 15, value = 7, label = 'Guidance Scale')
                nb_iterations = gr.Slider(10, 50, value = 25, step = 1, label = 'Steps')
                seed = gr.Slider(label = "Seed", minimum = 0, maximum = 2147483647, step = 1, randomize = True)
    # Step 2 heading.
    gr.Markdown(
        """
## 2. Check Whisper output, correct it if necessary:
"""
    )
    with gr.Row():
        # Receives the first value returned by translate_better.
        transcripted_output = gr.Textbox(
            label="Transcription in your detected spoken language",
            lines=3,
            elem_id="transcripted"
        )
        #language_detected_output = gr.Textbox(label="Native language", elem_id="spoken_lang",lines=3)
    with gr.Column():
        # Receives the second value returned by translate_better; its content
        # is what diffuse_btn sends to get_images as the prompt.
        translated_output = gr.Textbox(
            label="Transcript translated in English by Whisper",
            lines=4,
            elem_id="translated"
        )
        with gr.Row():
            clear_btn = gr.Button(value="Clear")
            diffuse_btn = gr.Button(value="OK, Diffuse this prompt !", elem_id="diffuse_btn")
        # Clear resets only the translated textbox; the lambda ignores the
        # value it receives from the button.
        clear_btn.click(fn=lambda value: gr.update(value=""), inputs=clear_btn, outputs=translated_output)
    # Step 3 heading.
    gr.Markdown("""
## 3. Wait for Stable Diffusion Results βοΈ
Inference time is about ~10 seconds, when it's your turn π¬
"""
    )
    # Gallery that displays the paths returned by get_images.
    sd_output = gr.Gallery().style(grid=2, height="auto")
    # "Check Whisper first" buttons: run Whisper only and fill both textboxes.
    audio_r_translate.click(translate_better,
        inputs = record_input,
        outputs = [
            #language_detected_output,
            transcripted_output,
            translated_output
        ])
    audio_u_translate.click(translate_better,
        inputs = upload_input,
        outputs = [
            #language_detected_output,
            transcripted_output,
            translated_output
        ])
    # "Magic" buttons: run Whisper and image generation in one go, filling
    # both textboxes and the gallery.
    audio_r_direct_sd.click(magic_whisper_to_sd,
        inputs = [
            record_input,
            guidance_scale,
            nb_iterations,
            seed
        ],
        outputs = [
            #language_detected_output,
            transcripted_output,
            translated_output,
            sd_output
        ])
    audio_u_direct_sd.click(magic_whisper_to_sd,
        inputs = [
            upload_input,
            guidance_scale,
            nb_iterations,
            seed
        ],
        outputs = [
            #language_detected_output,
            transcripted_output,
            translated_output,
            sd_output
        ])
    # Generate images from the current (possibly hand-corrected) translation.
    diffuse_btn.click(get_images,
        inputs = [
            translated_output
        ],
        outputs = sd_output
    )
# Script entry point: only start the app when this file is run directly.
if __name__ == "__main__":
    # Enable request queuing with the configured limits, then start the app.
    # A stray trailing "|" after launch() was a syntax error and is removed.
    demo.queue(max_size=32, concurrency_count=20).launch()