File size: 5,307 Bytes
372395e
caed802
b19e4a9
f694503
04d2706
 
caed802
0f097d0
8e35aef
372395e
41b5a1b
cd970d9
2fc2b20
ce3ce04
2a2bbb7
347dc6b
2a2bbb7
4e0c414
0f097d0
 
ce3ce04
8e35aef
347dc6b
68b4d15
8c7a831
eb1af87
b6e8417
68b4d15
 
0f097d0
 
cb934a1
 
a89612e
41b5a1b
b5357a4
541cb6f
 
d8e7ff1
8e35aef
 
 
 
 
 
71f4435
8e35aef
372395e
79b4496
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0f097d0
79b4496
 
 
 
 
 
 
 
 
 
 
e7c2915
 
 
 
 
 
 
 
 
 
 
 
12bd467
 
 
 
 
 
e7c2915
 
 
79b4496
 
372395e
 
 
79b4496
372395e
 
7acb3e3
be460ce
 
 
 
 
 
8e6038a
8e35aef
 
570b690
79b4496
53f5458
 
 
 
 
79b4496
7acb3e3
3e7a6f1
edbc703
2fc2b20
347dc6b
a89612e
8e35aef
bc6b39c
372395e
a63d987
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import gradio as gr
import os
import time
from moviepy.editor import *
from share_btn import community_icon_html, loading_icon_html, share_js

# Hugging Face token from the deployment environment (read but not used below).
token = os.environ.get('HF_TOKEN')
# Remote Spaces wrapped as callable APIs:
#   caption   -> CoCa image-captioning Space
#   audio_gen -> AudioLDM text-to-audio Space
caption = gr.Blocks.load(name="spaces/laion/CoCa")
audio_gen = gr.Blocks.load(name="spaces/haoheliu/audioldm-text-to-audio-generation")

# Default placeholder shown in the optional manual-caption textbox.
ph_message="If you're not happy with sound result, you can manually describe the scene depicted in your image :)"

def input_changes(input_img):
    """React to the image input changing.

    On image removal, reset the manual-caption placeholder and clear both
    outputs. On a new image, ask the CoCa Space for a caption, store it in
    the hidden ``caption_output`` textbox, and surface it in the manual
    caption's placeholder.

    Returns updates for (manual_cap, caption_output, sound_output).
    """
    # `is None` rather than `== None`: identity test for the cleared state.
    if input_img is None:
        return manual_cap.update(value="", placeholder=ph_message), caption_output.update(value=None), sound_output.update(value=None)

    # Query the CoCa Space (beam search with its default decoding params).
    cap = caption(input_img, "Beam search", 1.2, 0.5, 5, 20, fn_index=0)
    print("CoCa caption: '" + cap + "' • ")
    # Prefix the placeholder with the auto-caption so the user can see it.
    ph_update = "CoCa caption: '" + cap + "' • "

    return manual_cap.update(value="", placeholder=f"{ph_update}{ph_message}"), caption_output.update(value=cap), sound_output.update(value=None)
 
def infer(image_input, manual_caption, duration_in, seed, caption_output):
    """Generate a sound effect for the uploaded image.

    Uses ``manual_caption`` when the user typed one, otherwise falls back to
    the CoCa caption stored in the hidden ``caption_output`` textbox. The
    AudioLDM Space returns a path to a generated video file; its audio track
    is extracted to "sound.mp3".

    Returns (caption_used, audio_path, share-group visibility update).
    """
    print(duration_in)
    if manual_caption == "":
        # No manual description: reuse the caption computed on image upload.
        cap = caption_output
    else:
        cap = manual_caption
        print("manual caption: " + cap)

    # AudioLDM Space call; returns a filepath to a video clip with the sound.
    sound = audio_gen(cap, duration_in, 2.5, seed, 3, "audioldm-m-text-ft", fn_index=0)
    print(sound)

    # Extract the audio track; close the clip afterwards so the underlying
    # ffmpeg reader and file handle are released (was leaked before).
    video = VideoFileClip(sound)
    try:
        video.audio.write_audiofile("sound.mp3")
    finally:
        video.close()

    return cap, "sound.mp3", gr.Group.update(visible=True)

# Static HTML header rendered at the top of the app (injected via gr.HTML).
title = """
    <div style="text-align: center; max-width: 700px; margin: 0 auto;">
        <div
        style="
            display: inline-flex;
            align-items: center;
            gap: 0.8rem;
            font-size: 1.75rem;
        "
        >
        <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
            Image to Sound Effect
        </h1>
        </div>
        <p style="margin-bottom: 10px; font-size: 94%">
        Convert an image to a corresponding sound effect generated through CoCa Image Captioning & AudioLDM
        </p>
    </div>
"""

article = """
    
    <div class="footer">
        <p>
         
        Follow <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a> for future updates πŸ€—
        </p>
    </div>

    <div id="may-like-container" style="display: flex;justify-content: center;flex-direction: column;align-items: center;margin-bottom: 30px;">
        <p>You may also like: </p>
        
        <div id="may-like-content" style="display:flex;flex-wrap: wrap;align-items:center;height:20px;">
            
            <svg height="20" width="208" style="margin-left:4px;margin-bottom: 6px;">       
                 <a href="https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation" target="_blank">
                    <image href="https://img.shields.io/badge/πŸ€— Spaces-AudioLDM_Text_to_Audio-blue" src="https://img.shields.io/badge/πŸ€— Spaces-AudioLDM_Text_to_Audio-blue.png" height="20"/>
                 </a>
            </svg>

            <svg height="20" width="122" style="margin-left:4px;margin-bottom: 6px;">       
                 <a href="https://huggingface.co/spaces/fffiloni/spectrogram-to-music" target="_blank">
                    <image href="https://img.shields.io/badge/πŸ€— Spaces-Riffusion-blue" src="https://img.shields.io/badge/πŸ€— Spaces-Riffusion-blue.png" height="20"/>
                 </a>
            </svg>
            
        </div>
    </div>
"""

# UI layout and event wiring (gradio 3.x component.update API).
with gr.Blocks(css="style.css") as demo:
    with gr.Column(elem_id="col-container"):
        
        gr.HTML(title)
    
        # "filepath" type so the local path can be forwarded to the CoCa Space.
        input_img = gr.Image(type="filepath", elem_id="input-img")
        
        with gr.Column():
            manual_cap = gr.Textbox(label="Manual Image description (optional)", lines=3, placeholder=ph_message)
            with gr.Row():
                duration_in = gr.Slider(minimum=5, maximum=10, step=5, value=5, label="Duration")
                seed_in = gr.Slider(label="Seed", value=440, minimum=45, maximum=10000, step=1)
        
        # Hidden textbox that carries the auto-generated CoCa caption between
        # the image-change event and the generate click.
        caption_output = gr.Textbox(label="Caption", visible=False, elem_id="text-caption")
        sound_output = gr.Audio(label="Result", elem_id="sound-output")
        #debug = gr.Textbox()
        generate = gr.Button("Generate SFX from Image")

        # Share-to-community widgets; revealed by infer() after generation.
        with gr.Group(elem_id="share-btn-container", visible=False) as share_group:
            community_icon = gr.HTML(community_icon_html)
            loading_icon = gr.HTML(loading_icon_html)
            share_button = gr.Button("Share to community", elem_id="share-btn")

        gr.HTML(article)

    # Auto-caption whenever the image changes; queue=False so the placeholder
    # and outputs update without waiting in the request queue.
    change_out = [manual_cap, caption_output, sound_output]
    input_img.change(input_changes, input_img, change_out, queue=False)
    

    
    generate.click(infer, inputs=[input_img, manual_cap, duration_in, seed_in, caption_output], outputs=[caption_output, sound_output, share_group], api_name="i2fx")
    share_button.click(None, [], [], _js=share_js)

# Queue up to 32 pending requests and launch with debug logging enabled.
demo.queue(max_size=32).launch(debug=True)