"""Gradio demo: caption an image, then generate a matching sound effect.

The image is sent to a remote captioning Space, the returned text is used as
the prompt for a remote text-to-audio Space, and the audio track of the
returned video is extracted to a .wav file that is shown in the UI.
"""

import json
import os
import re
import tempfile
from typing import Optional

import gradio as gr
from gradio_client import Client
from moviepy.audio.AudioClip import AudioClip
from moviepy.editor import VideoFileClip


def extract_audio(video_in: str) -> str:
    """Extract the audio track of a video file into a new .wav file.

    Args:
        video_in: Path of the video file to read.

    Returns:
        Path of the .wav file that was written.

    Raises:
        ValueError: If the video has no audio track.
    """
    video_clip = VideoFileClip(video_in)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"No audio track found in {video_in!r}")

        # Each call gets its own output file: the click handler below runs
        # with concurrency_limit=4, so a single shared 'audio.wav' could be
        # overwritten by another request before it is served.
        fd, output_audio = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # only the path is handed to moviepy

        # Use 44100 Hz as the sample rate for .wav files
        audio_clip.write_audiofile(output_audio, fps=44100)
    finally:
        # Release the reader even when extraction fails.
        video_clip.close()

    print("Audio extraction complete.")
    return output_audio


def _truncate_after_last_period(text: str) -> str:
    """Return *text* cut right after its last '.', or unchanged if it has none."""
    last_period_index = text.rfind('.')
    if last_period_index == -1:
        # Without this guard the slice below would be text[:0], i.e. "".
        return text
    return text[:last_period_index + 1]


def get_caption_from_kosmos(image_in: str) -> str:
    """Caption an image with the Kosmos-2 Space and tidy the returned text.

    Args:
        image_in: Filepath or URL of the image to describe.

    Returns:
        The description with the "Describe this image in detail:" prefix
        removed (when present) and cut after its last period (when it
        contains one).
    """
    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
    kosmos2_result = kosmos2_client.predict(
        image_in,  # str (filepath or URL to image) in 'Test Image' Image component
        "Detailed",  # str in 'Description Type' Radio component
        fn_index=4,
    )
    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    # The second element of the result is read as the path of a JSON file;
    # the first item of each of its entries is a text fragment.
    with open(kosmos2_result[1], 'r', encoding="utf-8") as f:
        data = json.load(f)

    full_sentence = ' '.join(sublist[0] for sublist in data)

    # Expected format: "Describe this image in detail:" followed by optional
    # whitespace and then the description itself.
    pattern = r'^Describe this image in detail:\s*(.*)$'
    match = re.search(pattern, full_sentence)
    if match:
        description = match.group(1)
        print(description)
    else:
        print("Unable to locate valid description.")
        # Fall back to the raw text instead of failing on an unbound name.
        description = full_sentence

    truncated_caption = _truncate_after_last_period(description)
    print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
    return truncated_caption


def get_caption(image_in: str) -> str:
    """Ask the moondream1 Space for a sound-effect description of an image.

    Args:
        image_in: Filepath of the image to send.

    Returns:
        The answer returned by the Space's "/answer_question" endpoint.
    """
    client = Client("https://vikhyatk-moondream1.hf.space/")
    result = client.predict(
        image_in,  # filepath in 'image' Image component
        "provided the given image caption, generate a one sentence long description of an appropriate sound effect for the context",  # str in 'Question' Textbox component
        api_name="/answer_question",
    )
    print(result)
    return result


def get_audioldm(prompt: str) -> str:
    """Generate a sound with the AudioLDM-2 Space and return it as a .wav path.

    Args:
        prompt: Text description of the sound to generate.

    Returns:
        Path of the .wav file extracted from the Space's result.
    """
    client = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
    # NOTE(review): the positional values after the prompt are presumably the
    # negative prompt, duration, guidance scale, seed and candidate count --
    # confirm against the Space's API page.
    result = client.predict(
        prompt,
        "low quality",
        10,
        3.5,
        45,
        3,
        fn_index=1,
    )
    print(result)
    # The result is handed to extract_audio as a video file path.
    return extract_audio(result)


# Models this file can actually serve. The page text also mentions MAGNet and
# AudioGen, but no get_magnet / get_audiogen function is defined here; add
# them to this table once their client functions exist.
_SOUND_GENERATORS = {
    "AudioLDM-2": get_audioldm,
}


def infer(image_in: Optional[str], chosen_model: str) -> str:
    """Caption the image and generate a sound effect with the chosen model.

    Args:
        image_in: Filepath of the uploaded image.
        chosen_model: Label of the audio model selected in the UI.

    Returns:
        Path of the generated audio file.

    Raises:
        gr.Error: If the model is not supported or no image was provided.
    """
    # Validate before any remote call so bad input fails fast.
    generate = _SOUND_GENERATORS.get(chosen_model)
    if generate is None:
        raise gr.Error(f"Unsupported model: {chosen_model}")
    if not image_in:
        raise gr.Error("Please provide an input image.")

    caption = get_caption(image_in)
    return generate(caption)


css = """ #col-container{ margin: 0 auto; max-width: 800px; } """

# NOTE(review): the nesting of the layout containers below was reconstructed
# from statement order -- confirm against the intended page layout.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
Compare MAGNet, AudioLDM2 and AudioGen sound effects generation from image caption.
""")
        with gr.Column():
            image_in = gr.Image(sources=["upload"], type="filepath", label="Image input", value="doggy.jpg")
            with gr.Row():
                chosen_model = gr.Radio(label="Choose a model", choices=["AudioLDM-2"], value="AudioLDM-2")
                submit_btn = gr.Button("Submit")
        with gr.Column():
            audio_o = gr.Audio(label="Audio output")

    submit_btn.click(
        fn=infer,
        inputs=[image_in, chosen_model],
        outputs=[audio_o],
        concurrency_limit=4,
    )

demo.queue(max_size=10).launch(debug=True)