Image2Audio

Sleeping

File size: 8,489 Bytes

import os
import gradio as gr
import requests
import gradio as gr
from gradio_client import Client
import json
import re
from moviepy.editor import VideoFileClip
from moviepy.audio.AudioClip import AudioClip

def search_pexels_videos(query):
    """Search the Pexels video API for ``query``.

    Args:
        query: Free-text search term.

    Returns:
        A list of ``{"url": ..., "image": ...}`` dicts built from the
        ``'url'`` and ``'image'`` fields of each entry under the response's
        ``'videos'`` key (at most 80 are requested). An empty list is
        returned when the response has no ``'videos'`` key.
    """
    API_KEY = os.getenv("API_KEY")
    url = "https://api.pexels.com/videos/search"
    headers = {"Authorization": API_KEY}
    # Let requests build the query string so that spaces, '&', '#' and
    # non-ASCII characters in the search term are percent-encoded instead of
    # corrupting the URL.
    params = {"query": query, "per_page": 80}
    # A timeout keeps a stalled connection from blocking the handler forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)
    data = response.json()

    # Check for the 'videos' key and fall back to an empty list when absent.
    if 'videos' in data:
        videos_urls = [{"url": video['url'], "image": video['image']} for video in data['videos']]
    else:
        print("No 'videos' key in response data. Returning empty list.")
        videos_urls = []
    return videos_urls


# Function that displays Pexels video search results as HTML links
def show_video_search_results(query):
    """Render video search results for ``query`` as one HTML string.

    Each result from ``search_pexels_videos`` becomes an inline-block
    ``<div>`` containing a thumbnail image that links to the video URL in a
    new tab.

    Args:
        query: Free-text search term.

    Returns:
        The concatenated HTML for all results (an empty string when there
        are none).
    """
    # Function-scope import so this block does not depend on the file header.
    from html import escape

    videos_info = search_pexels_videos(query)
    videos_html = []
    for video in videos_info:
        # The values come from an external API response and are placed inside
        # single-quoted attributes, so quotes and '&' must be escaped to keep
        # the markup well-formed and to prevent attribute injection.
        link = escape(str(video['url']), quote=True)
        thumbnail = escape(str(video['image']), quote=True)
        videos_html.append(
            f"<div style='margin: 10px; display: inline-block;'><a href='{link}' target='_blank' style='text-decoration: none;'><img src='{thumbnail}' alt='Video thumbnail' style='width: 200px;'><p>View Video</p></a></div>"
        )
    return "".join(videos_html)

    
# Pexels image search function
def search_pexels_images(query):
    """Search the Pexels photo API for ``query``.

    Args:
        query: Free-text search term.

    Returns:
        A list with the ``['src']['medium']`` value of each entry under the
        response's ``'photos'`` key (at most 80 are requested). An empty list
        is returned when the response has no ``'photos'`` key.
    """
    API_KEY = os.getenv("API_KEY")
    url = "https://api.pexels.com/v1/search"
    headers = {"Authorization": API_KEY}
    # Let requests percent-encode the search term rather than splicing it
    # into the URL by hand.
    params = {"query": query, "per_page": 80}
    # A timeout keeps a stalled connection from blocking the handler forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)
    data = response.json()

    # Responses without a 'photos' key would otherwise raise KeyError; mirror
    # the video search and return an empty result instead.
    if 'photos' not in data:
        print("No 'photos' key in response data. Returning empty list.")
        return []
    return [photo['src']['medium'] for photo in data['photos']]

# Function that displays Pexels image search results
def show_search_results(query):
    """Return the image URLs that ``search_pexels_images`` finds for ``query``."""
    return search_pexels_images(query)

def extract_audio(video_in):
    """Write the audio track of a video file to ``audio.wav``.

    Args:
        video_in: Path of the video file, passed to ``VideoFileClip``.

    Returns:
        The string ``'audio.wav'``, the path the audio was written to.

    Raises:
        ValueError: If the clip has no audio track.
    """
    # NOTE(review): the output path is fixed, so overlapping calls write to
    # the same file.
    output_audio = 'audio.wav'

    # Open the video file and extract the audio
    video_clip = VideoFileClip(video_in)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in {video_in!r}.")

        # Save the audio as a .wav file at a 44100 Hz sample rate
        audio_clip.write_audiofile(output_audio, fps=44100)
    finally:
        # Release the clip's reader resources even when extraction fails.
        video_clip.close()
    print("Audio extraction complete.")

    return output_audio

def get_caption_from_kosmos(image_in):
    """Caption an image with the remote Kosmos-2 endpoint.

    The second element of the prediction result is opened as a JSON file.
    The first item of each of its entries is joined with spaces into one
    sentence. The leading "Describe this image in detail:" prompt is then
    stripped, and the text is cut after its last period.

    Args:
        image_in: Image file path or URL forwarded to the endpoint.

    Returns:
        The cleaned caption. When the prompt prefix is not found, the full
        joined text is used. When the text has no period, it is returned
        untruncated.
    """
    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")

    kosmos2_result = kosmos2_client.predict(
        image_in,    # str (filepath or URL to image) in 'Test Image' Image component
        "Detailed",  # str in 'Description Type' Radio component
        fn_index=4
    )

    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    # Read with an explicit encoding so parsing does not depend on the
    # platform's default locale.
    with open(kosmos2_result[1], 'r', encoding='utf-8') as f:
        data = json.load(f)

    full_sentence = ' '.join(sublist[0] for sublist in data)

    # Match the expected format: "Describe this image in detail:" followed by
    # optional whitespace and then the description itself.
    pattern = r'^Describe this image in detail:\s*(.*)$'
    match = re.search(pattern, full_sentence)
    if match:
        description = match.group(1)
        print(description)
    else:
        print("Unable to locate valid description.")
        # Previously `description` stayed unbound here and the next statement
        # raised UnboundLocalError; fall back to the unfiltered text.
        description = full_sentence

    # Truncate after the last period to drop a trailing incomplete sentence.
    last_period_index = description.rfind('.')
    if last_period_index == -1:
        # No period at all: keep the text rather than slicing it to ''.
        truncated_caption = description
    else:
        truncated_caption = description[:last_period_index + 1]

    print(f"\n—\nIMAGE CAPTION: {truncated_caption}")

    return truncated_caption

def get_caption(image_in):
    """Request a one-sentence description of ``image_in`` from the remote
    ``/answer_question`` endpoint, print it, and return it."""
    question = "Describe precisely the image in one sentence."
    captioner = Client("https://vikhyatk-moondream1.hf.space/")
    # Positional inputs: image filepath, then the question text.
    answer = captioner.predict(image_in, question, api_name="/answer_question")
    print(answer)
    return answer

def get_magnet(prompt):
    """Send ``prompt`` to the remote MAGNeT ``/predict_full`` endpoint.

    The prompt is formatted to text and printed, the endpoint is called with
    a fixed set of generation settings, the raw result is printed, and the
    element at index 1 of that result is returned.
    """
    text = f"{prompt}"
    print(text)

    # Positional inputs, in the order the endpoint expects them.
    settings = (
        "facebook/audio-magnet-medium",  # 'Model' Radio component
        "",                              # 'Model Path (custom models)' Textbox
        text,                            # 'Input Text' Textbox
        3,                               # 'Temperature'
        0.9,                             # 'Top-p'
        10,                              # 'Max CFG coefficient'
        1,                               # 'Min CFG coefficient'
        20,                              # 'Decoding Steps (stage 1)'
        10,                              # 'Decoding Steps (stage 2)'
        10,                              # 'Decoding Steps (stage 3)'
        10,                              # 'Decoding Steps (stage 4)'
        "prod-stride1 (new!)",           # 'Span Scoring' Radio component
    )

    magnet = Client("https://fffiloni-magnet.hf.space/")
    outputs = magnet.predict(*settings, api_name="/predict_full")
    print(outputs)
    return outputs[1]

def get_audioldm(prompt):
    """Send ``prompt`` to the remote AudioLDM-2 endpoint.

    The raw result is printed and passed to ``extract_audio``, and the path
    that call returns is given back to the caller.
    """
    # Positional inputs, in the order the endpoint expects them.
    settings = (
        prompt,                 # 'Input text' Textbox
        "Low quality. Music.",  # 'Negative prompt' Textbox
        10,                     # 'Duration (seconds)' Slider (5 to 15)
        3.5,                    # 'Guidance scale' Slider (0 to 7)
        45,                     # 'Seed' Number
        3,                      # 'Number waveforms to generate' Slider (1 to 5)
    )

    audioldm = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
    generated = audioldm.predict(*settings, fn_index=1)
    print(generated)
    return extract_audio(generated)

def get_audiogen(prompt):
    """Call the remote AudioGen ``/infer`` endpoint with ``prompt`` and the
    fixed second argument ``10``, returning its result unchanged."""
    audiogen = Client("https://fffiloni-audiogen.hf.space/")
    return audiogen.predict(prompt, 10, api_name="/infer")

def infer(image_in, chosen_model):
    """Caption an image and turn the caption into audio with the chosen model.

    Args:
        image_in: Path of the input image.
        chosen_model: One of ``"MAGNet"``, ``"AudioLDM-2"`` or ``"AudioGen"``.

    Returns:
        Whatever the selected generator function returns for the caption.

    Raises:
        ValueError: If no image was supplied or ``chosen_model`` is not one
            of the supported names.
    """
    supported_models = ("MAGNet", "AudioLDM-2", "AudioGen")

    # Validate up front so bad input fails fast with a clear message, before
    # any remote call is made. Previously an unknown model silently returned
    # None after the caption request had already been sent.
    if not image_in:
        raise ValueError("No image provided. Please upload an image first.")
    if chosen_model not in supported_models:
        raise ValueError(
            f"Unknown model {chosen_model!r}; expected one of {', '.join(supported_models)}."
        )

    caption = get_caption(image_in)
    if chosen_model == "MAGNet":
        return get_magnet(caption)
    if chosen_model == "AudioLDM-2":
        return get_audioldm(caption)
    return get_audiogen(caption)

# Stylesheet text that centres an element with id "col-container" and caps its
# width at 800px.
# NOTE(review): in the code visible here this string is not passed to
# gr.Blocks() and no component is given the id "col-container", so it appears
# to be unused - confirm.
css="""
#col-container{
    margin: 0 auto;
    max-width: 800px;
}
"""

# Build the UI: three tabs, each with its inputs, a trigger button, an output
# component, and a click handler wired to one of the functions defined above.
with gr.Blocks() as app:
    with gr.Tabs():
        # Tab 1: upload an image, pick a model, get generated audio via infer().
        with gr.TabItem("Image to Audio"):
            with gr.Column():
                gr.Markdown("### Image to Audio")
                # type="filepath" means infer() receives the image as a file path.
                image_in = gr.Image(sources=["upload"], type="filepath", label="Image input")
                chosen_model = gr.Radio(label="Choose a model", choices=["MAGNet", "AudioLDM-2", "AudioGen"], value="AudioLDM-2")
                submit_btn = gr.Button("Submit")
                audio_o = gr.Audio(label="Audio output")
                submit_btn.click(
                    fn=infer,
                    inputs=[image_in, chosen_model],
                    outputs=audio_o
                )

        # Tab 2: image search; results from show_search_results() fill a gallery.
        with gr.TabItem("FREE Image Search"):
            with gr.Column():
                gr.Markdown("### FREE Image Search")
                search_query = gr.Textbox(label="사진 검색")
                search_btn = gr.Button("검색")
                images_output = gr.Gallery(label="검색 결과 이미지")
                search_btn.click(
                    fn=show_search_results,
                    inputs=search_query,
                    outputs=images_output
                )

        # Tab 3: video search; show_video_search_results() returns HTML markup.
        with gr.TabItem("FREE Video Search"):
            with gr.Column():
                gr.Markdown("### FREE Video Search")
                video_search_query = gr.Textbox(label="비디오 검색")
                video_search_btn = gr.Button("검색")
                # Show the video search results in an HTML component
                videos_output = gr.HTML(label="검색 결과 동영상")
                video_search_btn.click(
                    fn=show_video_search_results,
                    inputs=video_search_query,
                    outputs=videos_output
                )

# Start the app at import/run time with debug mode enabled.
app.launch(debug=True)