File size: 5,089 Bytes
372395e
caed802
b19e4a9
f694503
04d2706
 
caed802
372395e
caed802
372395e
41b5a1b
cd970d9
a89612e
b154383
a89612e
 
b154383
a89612e
 
 
cd970d9
68b4d15
eb1af87
b6e8417
68b4d15
 
 
 
cb934a1
 
a89612e
41b5a1b
b5357a4
be460ce
6cc068b
71f4435
 
372395e
79b4496
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48c215d
79b4496
 
 
 
 
 
 
 
 
 
 
e7c2915
 
 
 
 
 
 
 
 
 
 
 
12bd467
 
 
 
 
 
e7c2915
 
 
79b4496
 
372395e
 
 
79b4496
372395e
 
7acb3e3
be460ce
 
 
 
 
 
8e6038a
f694503
372395e
570b690
79b4496
53f5458
 
 
 
 
79b4496
7acb3e3
59a139e
a89612e
 
71f4435
bc6b39c
372395e
a63d987
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import gradio as gr
import os
import time
from moviepy.editor import *
from share_btn import community_icon_html, loading_icon_html, share_js

# HF token for the gated AudioLDM Space (None if not set in the env).
token = os.environ.get('HF_TOKEN')
# Remote Spaces used as inference backends:
#   caption   — GPT2-based image captioning
#   audio_gen — AudioLDM text-to-audio generation (authenticated)
caption = gr.Blocks.load(name="spaces/SRDdev/Image-Caption")
audio_gen = gr.Blocks.load(name="spaces/fffiloni/audioldm-text-to-audio-generation-clone", api_key=token)

# Default placeholder for the manual-caption textbox.
ph_message="If you're not happy with sound result, you can manually describe the scene depicted in your image :)"

def clean(input_img):
    """Reset dependent UI state whenever the input image changes.

    Wired to ``input_img.change``. If the image was cleared, blank the
    manual-caption textbox (restoring the default placeholder) and clear
    both outputs. If a new image was set, run the remote captioning Space
    and surface the caption in the hidden caption output and in the
    textbox placeholder.

    NOTE(review): reads the module-level components ``manual_cap``,
    ``caption_output`` and ``sound_output`` defined inside the Blocks
    context below — they exist by the time Gradio invokes this callback.
    """
    # Identity check: `is None`, not `== None` (PEP 8).
    if input_img is None:
        return manual_cap.update(value="", placeholder=ph_message), caption_output.update(value=None), sound_output.update(value=None)

    cap = caption(input_img, fn_index=0)
    # Build the placeholder prefix once and reuse it for logging
    # (original code built the identical string twice).
    ph_update = "gpt2 caption: '" + cap + "' β€’ "
    print(ph_update)
    return manual_cap.update(value="", placeholder=f"{ph_update}{ph_message}"), caption_output.update(value=cap), sound_output.update(value=None)

def infer(image_input, manual_caption, duration_in, seed, caption_output):
    """Generate a sound effect for the current image.

    Uses the manual description when the user typed one, otherwise falls
    back to the auto-generated caption held in the hidden ``caption_output``
    textbox (populated by ``clean``). Sends the prompt to the AudioLDM
    Space and returns the caption, the generated audio, and an update
    that reveals the share-button group.

    Relies on the module-level ``audio_gen`` Space handle.
    """
    print(duration_in)  # debug: log requested duration
    if manual_caption == "":
        # No manual description — reuse the caption computed on image change.
        cap = caption_output
    else:
        cap = manual_caption
        print("manual caption: " + cap)

    # fn_index=0 targets the Space's text-to-audio endpoint; positional
    # args presumably are (prompt, duration, guidance, seed, candidates)
    # — TODO confirm against the AudioLDM Space API.
    sound = audio_gen(cap, duration_in, 2.5, seed, 3, fn_index=0)

    # sound[1] is the audio component's value from the remote Space output.
    return cap, sound[1], gr.Group.update(visible=True)

# Static HTML header injected at the top of the UI via gr.HTML(title).
title = """
    <div style="text-align: center; max-width: 700px; margin: 0 auto;">
        <div
        style="
            display: inline-flex;
            align-items: center;
            gap: 0.8rem;
            font-size: 1.75rem;
        "
        >
        <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
            Image to Sound Effect
        </h1>
        </div>
        <p style="margin-bottom: 10px; font-size: 94%">
        Convert an image to a corresponding sound effect generated through GPT2 Image Captioning & AudioLDM
        </p>
    </div>
"""

# Static HTML footer (credits + related-Spaces badges) injected via gr.HTML(article).
article = """
    
    <div class="footer">
        <p>
         
        Follow <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a> for future updates πŸ€—
        </p>
    </div>

    <div id="may-like-container" style="display: flex;justify-content: center;flex-direction: column;align-items: center;margin-bottom: 30px;">
        <p>You may also like: </p>
        
        <div id="may-like-content" style="display:flex;flex-wrap: wrap;align-items:center;height:20px;">
            
            <svg height="20" width="208" style="margin-left:4px;margin-bottom: 6px;">       
                 <a href="https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation" target="_blank">
                    <image href="https://img.shields.io/badge/πŸ€— Spaces-AudioLDM_Text_to_Audio-blue" src="https://img.shields.io/badge/πŸ€— Spaces-AudioLDM_Text_to_Audio-blue.png" height="20"/>
                 </a>
            </svg>

            <svg height="20" width="122" style="margin-left:4px;margin-bottom: 6px;">       
                 <a href="https://huggingface.co/spaces/fffiloni/spectrogram-to-music" target="_blank">
                    <image href="https://img.shields.io/badge/πŸ€— Spaces-Riffusion-blue" src="https://img.shields.io/badge/πŸ€— Spaces-Riffusion-blue.png" height="20"/>
                 </a>
            </svg>
            
        </div>
    </div>
"""

# --- UI layout and event wiring -------------------------------------------
with gr.Blocks(css="style.css") as demo:
    with gr.Column(elem_id="col-container"):
        
        gr.HTML(title)
    
        # filepath type so the path can be forwarded to the remote captioning Space.
        input_img = gr.Image(type="filepath", elem_id="input-img")
        
        with gr.Column():
            manual_cap = gr.Textbox(label="Manual Image description (optional)", lines=3, placeholder=ph_message)
            with gr.Row():
                duration_in = gr.Slider(minimum=5, maximum=10, step=5, value=5, label="Duration")
                seed_in = gr.Slider(label="Seed", value=440, minimum=45, maximum=10000, step=1)
        
        # Hidden textbox carrying the auto caption; read back by infer().
        caption_output = gr.Textbox(label="Caption", visible=False, elem_id="text-caption")
        sound_output = gr.Audio(label="Result", elem_id="sound-output")
        
        generate = gr.Button("Generate SFX from Image")

        # Share-to-community controls, revealed only after a successful generation.
        with gr.Group(elem_id="share-btn-container", visible=False) as share_group:
            community_icon = gr.HTML(community_icon_html)
            loading_icon = gr.HTML(loading_icon_html)
            share_button = gr.Button("Share to community", elem_id="share-btn")

        gr.HTML(article)

    # Changing (or clearing) the image resets caption/sound state via clean().
    clean_out = [manual_cap, caption_output, sound_output]
    input_img.change(clean, input_img, clean_out)
    
    generate.click(infer, inputs=[input_img, manual_cap, duration_in, seed_in, caption_output], outputs=[caption_output, sound_output, share_group], api_name="i2fx")
    # Share button runs client-side JS only; no Python callback.
    share_button.click(None, [], [], _js=share_js)

demo.queue(max_size=32).launch(debug=True)