File size: 5,326 Bytes
372395e caed802 b19e4a9 f694503 04d2706 caed802 6bf55a4 17ec562 372395e 41b5a1b cd970d9 2fc2b20 ce3ce04 2a2bbb7 347dc6b 2a2bbb7 743b8dc 0f097d0 ce3ce04 8e35aef 347dc6b 68b4d15 8c7a831 eb1af87 b6e8417 68b4d15 0f097d0 cb934a1 a89612e 41b5a1b b5357a4 541cb6f d8e7ff1 8e35aef 71f4435 8e35aef 372395e 79b4496 0f097d0 79b4496 e7c2915 12bd467 e7c2915 79b4496 372395e 79b4496 372395e 7acb3e3 be460ce 8e6038a 8e35aef 570b690 79b4496 53f5458 79b4496 7acb3e3 3e7a6f1 edbc703 2fc2b20 347dc6b a89612e 8e35aef bc6b39c 372395e a63d987 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import gradio as gr
import os
import time
from moviepy.editor import *
from share_btn import community_icon_html, loading_icon_html, share_js
# Hugging Face token read from the environment (None when HF_TOKEN is unset).
# NOTE(review): not referenced anywhere else in the visible code.
token = os.getenv("HF_TOKEN")

# Remote Spaces loaded as callables: image captioning and text-to-audio.
caption = gr.Blocks.load(name="spaces/fffiloni/CoCa-clone")
audio_gen = gr.Blocks.load(
    name="spaces/fffiloni/audioldm-text-to-audio-generation-copy"
)

# Default placeholder shown in the manual-caption textbox.
ph_message = (
    "If you're not happy with sound result, "
    "you can manually describe the scene depicted in your image :)"
)
def input_changes(input_img):
    """Refresh the caption widgets whenever the input image changes.

    Args:
        input_img: Current value of the image component, or ``None`` when
            no image is set.

    Returns:
        A 3-tuple of component updates, in order, for ``manual_cap``
        (value cleared, placeholder refreshed), ``caption_output`` (the new
        caption, or ``None``) and ``sound_output`` (always reset to ``None``).
    """
    # Identity test: `== None` relies on the value's __eq__ and is not a
    # reliable way to detect a missing image.
    if input_img is None:
        return (
            manual_cap.update(value="", placeholder=ph_message),
            caption_output.update(value=None),
            sound_output.update(value=None),
        )

    # Positional arguments are forwarded as-is to the loaded captioning
    # Space; their meaning is defined by that remote app — TODO confirm.
    cap = caption(input_img, "Nucleus sampling", 1.2, 0.5, 5, 20, fn_index=0)

    # The bullet separates the automatic caption from the generic hint
    # (the separator was previously a mis-decoded byte sequence).
    ph_update = f"CoCa caption: '{cap}' • "
    print(ph_update)
    return (
        manual_cap.update(value="", placeholder=f"{ph_update}{ph_message}"),
        caption_output.update(value=cap),
        sound_output.update(value=None),
    )
def infer(image_input, manual_caption, duration_in, seed, caption_output):
    """Generate a sound effect for the active caption and return it as MP3.

    Args:
        image_input: Current image value. Not used here; the caption comes
            from ``manual_caption`` or ``caption_output``.
        manual_caption: User-written description. When it is ``None``,
            empty or whitespace-only, ``caption_output`` is used instead.
        duration_in: Forwarded to the audio Space as its second argument.
        seed: Forwarded to the audio Space as its fourth argument.
        caption_output: Previously computed caption text.

    Returns:
        A 3-tuple ``(caption, mp3_path, group_update)``: the caption that
        was used, the path of the extracted MP3 file, and an update that
        makes the share group visible.

    Raises:
        ValueError: If the clip returned by the audio Space has no audio
            track.
    """
    import tempfile  # local import: only this function needs it

    print(duration_in)
    if manual_caption and manual_caption.strip():
        cap = manual_caption
        print("manual caption: " + cap)
    else:
        cap = caption_output

    # Arguments are forwarded positionally to the loaded audio Space; the
    # meaning of the constants (2.5, 3, model name) is defined by that
    # remote app — TODO confirm against its interface.
    sound = audio_gen(cap, duration_in, 2.5, seed, 3, "audioldm-m-text-ft", fn_index=0)
    print(sound)

    # Reserve a unique output path so overlapping requests cannot overwrite
    # each other's result (a fixed "sound.mp3" was shared by all callers).
    # The file is intentionally kept after return so it can be served.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        mp3_path = tmp.name

    # The Space result is opened as a video clip; only its audio is kept.
    video = VideoFileClip(sound)
    try:
        if video.audio is None:
            raise ValueError("Generated clip does not contain an audio track.")
        video.audio.write_audiofile(mp3_path)
    finally:
        # Always release the reader resources held by the clip.
        video.close()

    return cap, mp3_path, gr.Group.update(visible=True)
# Header HTML (page heading plus a one-line description); passed to gr.HTML
# as the first component of the layout.
title = """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="
display: inline-flex;
align-items: center;
gap: 0.8rem;
font-size: 1.75rem;
"
>
<h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
Image to Sound Effect
</h1>
</div>
<p style="margin-bottom: 10px; font-size: 94%">
Convert an image to a corresponding sound effect generated through CoCa Image Captioning & AudioLDM
</p>
</div>
"""
# Footer HTML passed to gr.HTML at the end of the layout: author credit and
# badge links to related Spaces. The emoji below replaces a mis-decoded byte
# sequence that previously appeared in the text and in the badge URLs.
article = """
<div class="footer">
<p>
Follow <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a> for future updates 🤗
</p>
</div>
<div id="may-like-container" style="display: flex;justify-content: center;flex-direction: column;align-items: center;margin-bottom: 30px;">
<p>You may also like: </p>
<div id="may-like-content" style="display:flex;flex-wrap: wrap;align-items:center;height:20px;">
<svg height="20" width="208" style="margin-left:4px;margin-bottom: 6px;">
<a href="https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation" target="_blank">
<image href="https://img.shields.io/badge/🤗 Spaces-AudioLDM_Text_to_Audio-blue" src="https://img.shields.io/badge/🤗 Spaces-AudioLDM_Text_to_Audio-blue.png" height="20"/>
</a>
</svg>
<svg height="20" width="122" style="margin-left:4px;margin-bottom: 6px;">
<a href="https://huggingface.co/spaces/fffiloni/spectrogram-to-music" target="_blank">
<image href="https://img.shields.io/badge/🤗 Spaces-Riffusion-blue" src="https://img.shields.io/badge/🤗 Spaces-Riffusion-blue.png" height="20"/>
</a>
</svg>
</div>
</div>
"""
# UI definition, event wiring and launch for the app.
# NOTE(review): the indentation of this section is missing in this copy, so
# the nesting of the `with` blocks below cannot be read from the text. It must
# be restored before this runs — presumably the component and event lines sit
# inside `gr.Blocks` and the final `launch` call is at module level; confirm
# against the deployed app.
with gr.Blocks(css="style.css") as demo:
with gr.Column(elem_id="col-container"):
# Page header.
gr.HTML(title)
# Image input; type="filepath" is requested from the component.
input_img = gr.Image(type="filepath", elem_id="input-img")
with gr.Column():
# Optional user-written description of the image.
manual_cap = gr.Textbox(label="Manual Image description (optional)", lines=3, placeholder=ph_message)
with gr.Row():
# With minimum=5, maximum=10 and step=5 the slider offers only 5 or 10.
duration_in = gr.Slider(minimum=5, maximum=10, step=5, value=5, label="Duration")
seed_in = gr.Slider(label="Seed", value=440, minimum=45, maximum=10000, step=1)
# Hidden textbox that carries the caption between the two callbacks.
caption_output = gr.Textbox(label="Caption", visible=False, elem_id="text-caption")
sound_output = gr.Audio(label="Result", elem_id="sound-output")
#debug = gr.Textbox()
generate = gr.Button("Generate SFX from Image")
# Share controls start hidden; `infer` returns an update that shows the group.
with gr.Group(elem_id="share-btn-container", visible=False) as share_group:
community_icon = gr.HTML(community_icon_html)
loading_icon = gr.HTML(loading_icon_html)
share_button = gr.Button("Share to community", elem_id="share-btn")
# Page footer.
gr.HTML(article)
# Components updated by `input_changes`, in the order of its returned tuple.
change_out = [manual_cap, caption_output, sound_output]
# Image change listener, registered with queue=False.
input_img.change(input_changes, input_img, change_out, queue=False)
# Generation listener; api_name="i2fx" is the name given to this endpoint.
generate.click(infer, inputs=[input_img, manual_cap, duration_in, seed_in, caption_output], outputs=[caption_output, sound_output, share_group], api_name="i2fx")
# No Python callback (fn is None): the click only runs the `share_js` script.
share_button.click(None, [], [], _js=share_js)
# Enable the request queue with max_size=32, then start the app in debug mode.
demo.queue(max_size=32).launch(debug=True)
|