import spaces
import gradio as gr
from gradio_client import Client
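# Client for the hosted moondream1 Space, queried below to caption each extracted frame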
client = Client("https://vikhyatk-moondream1.hf.space/")
import cv2
from moviepy.editor import VideoFileClip

# Pipeline overview:
# 1. extract and store one frame every N frames from the video input (N adapts to video length)
# 2. extract the audio track
# 3. for each extracted frame, get a caption from the captioning model and collect the captions into a list
# 4. for the audio, ask an audio question-answering model to describe the sound/scene
# 5. give everything to the LLM and ask it to summarize the video from the combined image and audio captions

import re
import torch
from transformers import pipeline

zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
pipe = pipeline("text-generation", model=zephyr_model, torch_dtype=torch.bfloat16, device_map="auto")

standard_sys = """
You will be provided a list of visual details observed at regular intervals, along with an audio description. These pieces of information originate from a single video. The visual details are extracted from the video at fixed time intervals and represent consecutive frames. Typically, the video consists of a brief sequence showing one or more subjects...

Please note that the following list of image descriptions (visual details) was obtained by extracting individual frames from a continuous video featuring one or more subjects. Depending on the case, all depicted individuals may correspond to the same person(s), with minor variations due to changes in lighting, angle, and facial expressions over time. Regardless, assume temporal continuity among the frames unless otherwise specified.

The audio events are a description of the whole scene based only on the video's audio track. Your job is to integrate these multimodal inputs intelligently and provide a very short summary of what is happening in the original video. Provide a succinct overview of what you understood.
"""

def trim_video(input_path, max_duration=10):
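    """Trim the input video to max_duration seconds; return the trimmed file's path, or the original path if it is already short enough."""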
    if input_path is not None:
        video_clip = VideoFileClip(input_path)
        output_path = "video_cut_10.mp4"
        if video_clip.duration > max_duration:
            trimmed_clip = video_clip.subclip(0, max_duration)
            trimmed_clip.write_videofile(output_path, audio_codec='aac')
            return output_path
        else:
            return input_path
    else:
        return None

def extract_frames(video_in, output_format='.jpg'):
    """Extract frames from a video at a regular interval and store them on disk.

    Args:
    - video_in: string or path-like object pointing to the video file
    - output_format: string indicating the desired format for saved images (default: '.jpg')

    Returns:
    A list of strings containing paths to the saved images.
    """
    # Adjust the sampling interval to the video length,
    # so that very short clips still yield several frames
    video_clip = VideoFileClip(video_in)
    if video_clip.duration <= 5:
        interval = 6
    else:
        interval = 24

    # Initialize variables
    vidcap = cv2.VideoCapture(video_in)
    frames = []
    count = 0

    # Loop through frames until there are no more
    while True:
        success, image = vidcap.read()

        # Check if successful read and not past end of video
        if success:
            #print('Read a new frame:', success)

            # Save current frame if it meets criteria
            if count % interval == 0:
                filename = f'frame_{count // interval}{output_format}'
                frames.append(filename)
                cv2.imwrite(filename, image)
                print(f'Saved {filename}')

            # Increment counter
            count += 1

        # Break out of loop when done reading frames
        else:
            break

    # Close video capture
    vidcap.release()
    print('Done extracting frames!')

    return frames
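
# Example usage (with one of the demo videos):
# frames = extract_frames("examples/train.mp4")  # -> ['frame_0.jpg', 'frame_1.jpg', ...]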

'''
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image

model_id = "vikhyatk/moondream2"
revision = "2024-03-06"
model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, revision=revision
)
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
'''

#@spaces.GPU()
def process_image(image_in):
    """Caption one extracted frame via the hosted moondream1 Space."""
    result = client.predict(
        image_in,  # filepath in 'image' Image component
        "Describe precisely the image in one sentence.",  # str in 'Question' Textbox component
        api_name="/answer_question"
        #api_name="/predict"
    )
    print(result)
    return result
    # Local alternative, using the commented-out moondream2 setup above:
    '''
    image = Image.open(image_in)
    enc_image = model.encode_image(image)
    result = model.answer_question(enc_image, "Describe the image in one sentence.", tokenizer)
    print(result)
    return result
    '''

def extract_audio(video_path):
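    """Write the video's audio track to an mp3 file and return its path, or None if the video is silent."""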
    video_clip = VideoFileClip(video_path)
    # Check if the video has audio
    if video_clip.audio is not None:
        audio_clip = video_clip.audio
        audio_clip.write_audiofile("output_audio.mp3")
        return "output_audio.mp3"
    else:
        print("The video does not have any audio.")
        return None

def get_salmonn(audio_in):
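    """Query the SALMONN-7B Space with fixed decoding parameters to describe the audio."""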
    salmonn_prompt = "Please describe the audio"
    salmonn_client = Client("fffiloni/SALMONN-7B-gradio")
    result = salmonn_client.predict(
        audio_in,  # filepath in 'Audio' Audio component
        salmonn_prompt,  # str in 'User question' Textbox component
        4,  # float (numeric value between 1 and 10) in 'beam search numbers' Slider component
        1,  # float (numeric value between 0.8 and 2.0) in 'temperature' Slider component
        0.9,  # float (numeric value between 0.1 and 1.0) in 'top p' Slider component
        api_name="/gradio_answer"
    )
    print(result)
    return result

@spaces.GPU()
def llm_process(user_prompt):
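    """Format the captions into a Zephyr chat prompt and return the model's short summary."""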
    agent_maker_sys = standard_sys
    
    # Zephyr chat format: <|system|> ... </s> <|user|> ... </s> <|assistant|>
    instruction = f"""
<|system|>
{agent_maker_sys}</s>
<|user|>
"""

    prompt = f"{instruction.strip()}\n{user_prompt}</s>\n<|assistant|>"
    outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
    # Strip the echoed prompt (everything from <|system|> through <|assistant|>) from the output
    pattern = r'\<\|system\|\>(.*?)\<\|assistant\|\>'
    cleaned_text = re.sub(pattern, '', outputs[0]["generated_text"], flags=re.DOTALL)

    print(f"SUGGESTED video description: {cleaned_text}")
    return cleaned_text.lstrip("\n")

def infer(video_in):
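    """Full pipeline: caption sampled frames, describe the audio, then ask the LLM for a short video summary."""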
    # Extract frames from a video
    gr.Info("Extracting frames...")
    frame_files = extract_frames(video_in)
    
    # Process each extracted frame and collect results in a list
    gr.Info("Captioning frames ...")
    processed_texts = []
    for frame_file in frame_files:
        text = process_image(frame_file)
        processed_texts.append(text)
    print(processed_texts)

    # Join the captions into a single newline-separated string
    string_list = '\n'.join(processed_texts)

    # Extract audio from video
    extracted_audio = extract_audio(video_in)

    if extracted_audio is not None:
        print(extracted_audio)

        # Get description of audio content
        gr.Info("Getting audio description from extracted sound ...")
        audio_content_described = get_salmonn(extracted_audio)
    else:
        audio_content_described = "Video has no sound."

    # Assemble captions
    formatted_captions = f"""
### Visual events:\n{string_list}\n ### Audio events:\n{audio_content_described}
"""
    print(formatted_captions)

    # Send formatted captions to LLM
    gr.Info("Try to provide a video understanding with provided elements ...")
    video_description_from_llm = llm_process(formatted_captions)
    
    return video_description_from_llm

css = """
div#col-container{
    margin: 0 auto;
    max-width: 1280px;
}
div#video-text textarea {
    font-size: 20px;
    line-height: 1.2em;
    font-weight: 600;
}
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
        <h2 style="text-align: center;">Soft Video Understanding</h2>
        <p style="text-align: center;">
            An experiment in achieving what I call "soft video understanding" with openly available models. <br />
            We use moondream1 to caption the extracted frames and SALMONN to analyze the extracted audio, then give the visual and audio details to Zephyr, which is instructed to summarize what it understood.<br />
            The instruction prompt is available below for further discussion with the Community. <br />
            Note that audio is crucial for a better overall understanding. Videos longer than 10 seconds will be trimmed.
        </p>
        """)
        with gr.Row():
            with gr.Column():
                video_in = gr.Video(label="Video input")
                with gr.Accordion("System Instructions (for your curiosity)", open=False):
                    system_instruction = gr.Markdown(
                        value = standard_sys
                    )
                gr.Examples(
                    examples = ["examples/train.mp4", "examples/puppies.mp4", "examples/turtle.mp4"],
                    inputs = [video_in]
                )
            with gr.Column():
                video_cut = gr.Video(label="Video cut to 10 seconds", interactive=False)
                submit_btn = gr.Button("Submit")
                video_description = gr.Textbox(label="Video description", elem_id="video-text")
    
    video_in.change(
        fn = trim_video,
        inputs = [video_in],
        outputs = [video_cut],
        queue = False
    )
    submit_btn.click(
        fn = infer,
        inputs = [video_cut],
        outputs = [video_description]
    )
demo.queue(max_size=10).launch(show_error=True)