File size: 4,258 Bytes
79d11aa
 
 
 
 
 
d25de88
 
79d11aa
ccbd1ab
79d11aa
 
d25de88
 
79d11aa
 
66322a4
79d11aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce803ce
ccbd1ab
79d11aa
ccbd1ab
79d11aa
 
ccbd1ab
79d11aa
 
 
 
 
 
 
 
 
086eb0b
 
79d11aa
 
 
 
 
 
 
 
 
086eb0b
79d11aa
 
 
 
 
 
 
 
 
 
9f7fe25
 
79d11aa
 
086eb0b
9f7fe25
79d11aa
 
086eb0b
79d11aa
 
 
 
66322a4
79d11aa
66322a4
79d11aa
 
 
 
 
647c4c1
79d11aa
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import gradio as gr
from PIL import Image
import os

#from diffusers import StableDiffusionPipeline
# Load two remote Hugging Face Spaces as callable clients:
# - whisper: speech-to-text (transcribe + translate endpoints, selected via fn_index)
# - stable_diffusion: text-to-image generation (runwayml v1-5 space)
# NOTE(review): both calls hit the network at import time — confirm that is intended.
whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
#stable_diffusion = gr.Blocks.load(name="spaces/stabilityai/stable-diffusion")
stable_diffusion = gr.Blocks.load(name="spaces/runwayml/stable-diffusion-v1-5")
### β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# NOTE(review): `title` is not referenced anywhere in this file — confirm it is
# used elsewhere (e.g. passed to launch()) or intended as metadata only.
title="Talking to Stable Diffusion"
### β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
def get_images(prompt):
    """Generate images for *prompt* via the remote Stable Diffusion space.

    The space writes its results into a gallery directory and returns that
    directory's path; we expand it into a list of full paths, one per image.
    """
    gallery_dir = stable_diffusion(prompt, fn_index=2)
    image_paths = []
    for filename in os.listdir(gallery_dir):
        image_paths.append(os.path.join(gallery_dir, filename))
    return image_paths


def translate_better(audio):
    """Run the audio file through Whisper twice.

    One pass transcribes in the detected spoken language, the other
    translates into English; both results are printed for debugging.

    Returns:
        A ``(transcription, translation)`` pair of strings.
    """
    print("""
    β€”
    Sending audio to Whisper ...
    β€”
    """)
    transcription = whisper(audio, None, "transcribe", fn_index=0)
    translation = whisper(audio, None, "translate", fn_index=0)
    print("transcript: " + transcription)
    print("β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”")
    print("translated: " + translation)
    return transcription, translation


 
# UI layout: record audio -> (optionally) inspect Whisper output -> view
# Stable Diffusion results. Button callbacks are wired at the bottom.
with gr.Blocks() as demo:
    gr.Markdown(
            """
            ## 1. Say what you want:
            """
        )
    with gr.Column():
        with gr.Tab(label="Record audio input", elem_id="record_tab"):
            with gr.Column():
                # Microphone recording saved to a temp file; the file path is
                # what translate_better()/Whisper receive.
                record_input = gr.Audio(
                                    source="microphone",
                                    type="filepath", 
                                    show_label=False,
                                    elem_id="record_btn"
                                )
                with gr.Row():
                    audio_r_translate = gr.Button("Check Whisper first", elem_id="check_btn_1")              
                    audio_r_direct_sd = gr.Button("Generating Images", elem_id="magic_btn_1")
        
        # NOTE(review): these sliders are hidden (visible=False) and are not
        # passed to get_images() — confirm whether they should be wired up.
        with gr.Accordion(label="Stable Diffusion Settings", elem_id="sd_settings", visible=False):
            with gr.Row():
                guidance_scale = gr.Slider(2, 15, value = 7, label = 'Guidance Scale')
                nb_iterations = gr.Slider(10, 50, value = 25, step = 1, label = 'Steps')
                seed = gr.Slider(label = "Seed", minimum = 0, maximum = 2147483647, step = 1, randomize = True)
        
        gr.Markdown(
            """
            ## 2. Check Whisper output:
            """
        )
        
        with gr.Row():
            transcripted_output = gr.Textbox(
                                    label="Transcription in your detected spoken language", 
                                    lines=3,
                                    elem_id="transcripted"
                                )
            # BUG FIX: this box shows the English translation, but its label was
            # copy-pasted from the transcription box above.
            translated_output = gr.Textbox(
                                    label="Translation to English", 
                                    lines=3,
                                    elem_id="translated"
                                )
           
            
                    
        gr.Markdown("""
            ## 3. Wait for Stable Diffusion Results about ~10 seconds
            """
            ) 
        
        sd_output = gr.Gallery().style(grid=2, height="auto")
        # "Check Whisper first": fill both text boxes from the recording.
        audio_r_translate.click(translate_better, 
                                inputs = [
                                    record_input
                                ], 
                                outputs = [
                                    transcripted_output,
                                    translated_output,
                                ])
        # "Generating Images": feed the English translation box (which the user
        # may also have edited) into Stable Diffusion.
        audio_r_direct_sd.click(get_images, 
                              inputs = [
                                  translated_output
                                  ], 
                              outputs = sd_output
                          )
    
# Entry point: enable request queueing so long-running Whisper / Stable
# Diffusion calls from concurrent users are buffered, then serve the app.
if __name__ == "__main__":
    demo.queue(max_size=32, concurrency_count=20).launch()