"""Gradio demo: generate a sound effect for a (max 5 s) video.

Pipeline: sample frames from the uploaded video -> caption each frame with a
hosted CoCa Space -> feed the joined captions to a hosted AudioLDM Space ->
mux the generated audio back onto the original video.
"""
import gradio as gr
import os
import time
from moviepy.editor import *
import imageio
#from share_btn import community_icon_html, loading_icon_html, share_js

#token = os.environ.get('HF_TOKEN')

# Remote Spaces used as callable model endpoints (network access required).
caption = gr.Blocks.load(name="spaces/fffiloni/CoCa-clone")
audio_gen = gr.Blocks.load(name="spaces/fffiloni/audioldm-text-to-audio-generation-copy")

# Placeholder shown in the optional manual-description textbox.
ph_message = "If you're not happy with sound result, you can manually describe the scene depicted in your video, with audioLDM recommendations in mind :)"


def extract_video_frames(video_in):
    """Sample up to four frames from *video_in* and save them as JPEGs.

    The clip is truncated to its first 5 seconds, then frames are grabbed at
    t = 0, 2, 4 and the clip end. Files are written to the working directory
    as frame0.jpg .. frame3.jpg (overwritten on every call).

    Parameters
    ----------
    video_in : str
        Path to the input video file.

    Returns
    -------
    list[str]
        Paths of the saved frame images, in sampling order.
    """
    clip = VideoFileClip(video_in)
    total_duration = clip.duration
    if total_duration > 5:
        clip = clip.subclip(0, 5)
        total_duration = clip.duration
    # Fixed timestamps plus the clip end; NOTE(review): get_frame at exactly
    # clip.duration relies on MoviePy clamping to the last frame.
    intervals = [0, 2, 4, total_duration]
    frames = []
    for i, interval in enumerate(intervals):
        frame = clip.get_frame(interval)
        imageio.imwrite(f'frame{i}.jpg', frame)
        frames.append(f'frame{i}.jpg')
    # Release the reader process/file handle (was leaked before).
    clip.close()
    print(frames)
    return frames


def input_changes(input_vid):
    """React to the video input changing: caption frames or reset the UI.

    On clear (None) every dependent component is reset. Otherwise each
    sampled frame is captioned via the CoCa Space and the captions are
    joined into a single prompt shown in the (hidden) caption textbox.

    Returns updates for (manual_cap, caption_output, video_output,
    sound_output), matching ``change_out`` below.
    """
    if input_vid is None:  # was `== None`; identity check is the Python idiom
        return manual_cap.update(value="", placeholder=ph_message), caption_output.update(value=None), video_output.update(value=None), sound_output.update(value=None)
    picked_frames = extract_video_frames(input_vid)
    caps = []
    for one_frame in picked_frames:
        # fn_index=0 targets the Space's first API endpoint; the remaining
        # positional args are CoCa sampling parameters.
        cap = caption(one_frame, "Nucleus sampling", 1.2, 0.5, 5, 20, fn_index=0)
        caps.append(f"the sound of {cap}")
    print(caps)
    final_cap = '\n then '.join(caps)
    print(final_cap)
    print("CoCa caption: '" + final_cap + "' • ")
    ph_update = "CoCa caption: '" + final_cap + "' • "
    return manual_cap.update(value="", placeholder=f"{ph_update}{ph_message}"), caption_output.update(value=final_cap), video_output.update(value=None), sound_output.update(value=None)


def infer(video_input, manual_caption, duration_in, seed, caption_output):
    """Generate audio for the video and mux it back in.

    Parameters
    ----------
    video_input : str
        Path of the uploaded video (truncated to 5 s).
    manual_caption : str
        User-supplied prompt; when empty, the CoCa caption is used instead.
    duration_in : float
        Requested audio duration (seconds) passed to AudioLDM.
    seed : int
        AudioLDM random seed.
    caption_output : str
        Automatic caption produced by ``input_changes``.

    Returns
    -------
    tuple[str, str, str]
        (prompt used, "result.mp4", "sound.mp3").
    """
    print(duration_in)
    if manual_caption == "":
        cap = caption_output
    else:
        cap = manual_caption
        print("manual caption: " + cap)
    sound = audio_gen(cap, duration_in, 2.5, seed, 3, "audioldm-m-text-ft", fn_index=0)
    print(sound)
    # AudioLDM loaded demo returns a video, so we only keep the audio
    video = VideoFileClip(sound)
    video.audio.write_audiofile("sound.mp3")
    video.close()  # release the temporary AudioLDM clip (was leaked before)
    # Then we put the audio to the original video
    video_in = VideoFileClip(video_input)
    if video_in.duration > 5:
        video_in = video_in.subclip(0, 5)
    new_audio = AudioFileClip("sound.mp3")
    # Make the audio the same length as the video
    new_audio = new_audio.set_duration(video_in.duration)
    # Combine the audio and video, then save the result
    result = video_in.set_audio(new_audio)
    result.write_videofile("result.mp4", codec='libx264', audio_codec='aac')
    video_in.close()
    new_audio.close()
    return cap, "result.mp4", "sound.mp3"


title = """

Video to Sound Effect

Convert images from video to a corresponding sound effect generated through CoCa Image Captioning & AudioLDM.
This demo is experimental and works only with exactly 5 seconds videos.

"""
article = """

You may also like:

"""

with gr.Blocks(css="style.css") as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)
        input_vid = gr.Video(source="upload", type="filepath", elem_id="input-vid")
        with gr.Column():
            manual_cap = gr.Textbox(label="Manual Video description (optional)", lines=3, placeholder=ph_message)
            with gr.Row():
                duration_in = gr.Slider(interactive=False, minimum=5, maximum=10, step=5, value=5, label="Duration")
                seed_in = gr.Slider(label="Seed", value=440, minimum=45, maximum=10000, step=1)
        caption_output = gr.Textbox(label="Caption", visible=False, elem_id="text-caption")
        video_output = gr.Video(label="Result", elem_id="video-output")
        sound_output = gr.Audio()
        #debug = gr.Textbox()
        generate = gr.Button("Generate SFX from Video")
        #with gr.Group(elem_id="share-btn-container", visible=False) as share_group:
        #    community_icon = gr.HTML(community_icon_html)
        #    loading_icon = gr.HTML(loading_icon_html)
        #    share_button = gr.Button("Share to community", elem_id="share-btn")
        gr.HTML(article)

    # Re-caption (or reset) dependent components whenever the video changes.
    change_out = [manual_cap, caption_output, video_output, sound_output]
    input_vid.change(input_changes, input_vid, change_out, queue=False)
    generate.click(infer, inputs=[input_vid, manual_cap, duration_in, seed_in, caption_output], outputs=[caption_output, video_output, sound_output], api_name="v2fx")
    #share_button.click(None, [], [], _js=share_js)

demo.queue(max_size=32).launch(debug=True)