Spaces:

Bton
/

Image2AudioButBetter

Runtime error

App Files Files Community

Bton committed on Feb 9

Commit

6b9e278

•

1 Parent(s): 1d55143

Create app.py

Browse files

Files changed (1) hide show

app.py +134 -0

app.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import json
+import os
+import re
+import tempfile
+
+import gradio as gr
+from gradio_client import Client
+from moviepy.audio.AudioClip import AudioClip
+from moviepy.editor import VideoFileClip
+def extract_audio(video_in):
+    input_video = video_in
+    output_audio = 'audio.wav'
+    # Open the video file and extract the audio
+    video_clip = VideoFileClip(input_video)
+    audio_clip = video_clip.audio
+    # Save the audio as a .wav file
+    audio_clip.write_audiofile(output_audio, fps=44100)  # Use 44100 Hz as the sample rate for .wav files
+    print("Audio extraction complete.")
+    return 'audio.wav'
+def get_caption_from_kosmos(image_in):
+    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
+    kosmos2_result = kosmos2_client.predict(
+        image_in,	# str (filepath or URL to image) in 'Test Image' Image component
+        "Detailed",	# str in 'Description Type' Radio component
+        fn_index=4
+    )
+    print(f"KOSMOS2 RETURNS: {kosmos2_result}")
+    with open(kosmos2_result[1], 'r') as f:
+        data = json.load(f)
+    reconstructed_sentence = []
+    for sublist in data:
+        reconstructed_sentence.append(sublist[0])
+    full_sentence = ' '.join(reconstructed_sentence)
+    #print(full_sentence)
+    # Find the pattern matching the expected format ("Describe this image in detail:" followed by optional space and then the rest)...
+    pattern = r'^Describe this image in detail:\s*(.*)$'
+    # Apply the regex pattern to extract the description text.
+    match = re.search(pattern, full_sentence)
+    if match:
+        description = match.group(1)
+        print(description)
+    else:
+        print("Unable to locate valid description.")
+    # Find the last occurrence of "."
+    last_period_index = description.rfind('.')
+    # Truncate the string up to the last period
+    truncated_caption = description[:last_period_index + 1]
+    # print(truncated_caption)
+    print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
+    return truncated_caption
+def get_caption(image_in):
+    client = Client("https://vikhyatk-moondream1.hf.space/")
+    result = client.predict(
+		image_in,	# filepath  in 'image' Image component
+		"provided the given image caption, generate a one sentence long description of an appropriate sound effect for the context",	# str  in 'Question' Textbox component
+		api_name="/answer_question"
+    )
+    print(result)
+    return result
+def get_audioldm(prompt):
+    client = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
+    result = client.predict(
+        prompt,
+        "low quality",
+        10,
+        3.5,
+        45,
+        3,
+        fn_index=1
+    )
+    print(result)
+    audio_result = extract_audio(result)
+    return audio_result
+def infer(image_in, chosen_model):
+    caption = get_caption(image_in)
+    if chosen_model == "MAGNet" :
+        magnet_result = get_magnet(caption)
+        return magnet_result
+    elif chosen_model == "AudioLDM-2" :
+        audioldm_result = get_audioldm(caption)
+        return audioldm_result
+    elif chosen_model == "AudioGen" :
+        audiogen_result = get_audiogen(caption)
+        return audiogen_result
+css="""
+#col-container{
+    margin: 0 auto;
+    max-width: 800px;
+}
+"""
+with gr.Blocks(css=css) as demo:
+    with gr.Column(elem_id="col-container"):
+        gr.HTML("""
+        <h2 style="text-align: center;">
+            Image to SFX
+        </h2>
+        <p style="text-align: center;">
+            Compare MAGNet, AudioLDM2 and AudioGen sound effects generation from image caption.
+        </p>
+        """)
+        with gr.Column():
+            image_in = gr.Image(sources=["upload"], type="filepath", label="Image input", value="/content/1")
+            with gr.Row():
+                chosen_model = gr.Radio(label="Choose a model", choices=["AudioLDM-2"], value="AudioLDM-2")
+                submit_btn = gr.Button("Submit")
+        with gr.Column():
+            audio_o = gr.Audio(label="Audio output")
+    submit_btn.click(
+        fn=infer,
+        inputs=[image_in, chosen_model],
+        outputs=[audio_o],
+        concurrency_limit = 4
+    )
+demo.queue(max_size=10).launch(debug=True)