import gradio as gr
from gradio_client import Client
import cv2
from moviepy.editor import VideoFileClip
# Processing pipeline:
# 1. extract and store one frame every `interval` frames (24 by default) from the video input
# 2. extract the audio track
# 3. for each extracted frame, get a caption from the image captioning model and collect the captions into a list
# 4. for the audio, ask an audio question-answering model to describe the sound/scene
# 5. give everything to the LLM and ask it to summarize, combining the image caption list with the audio caption
import re
import torch
from transformers import pipeline
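# Load the Zephyr-7B chat model once at startup (bfloat16 weights, automatic device placement)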
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
pipe = pipeline("text-generation", model=zephyr_model, torch_dtype=torch.bfloat16, device_map="auto")
standard_sys = f"""
You will be provided a list of visual events and an audio description. All of this information comes from a single video.
The visual events were extracted from the video every 24 frames.
The video is usually a short sequence.
Repetitive descriptions of the same person or group of subjects mean it is the same person/subject, filmed without a cut.
The audio description was generated from the video's audio track.
Your job is to use this information to smartly deduce and provide a very short summary of what is happening in the video.
"""
def extract_frames(video_in, interval=24, output_format='.jpg'):
"""Extract frames from a video at a specified interval and store them in a list.
Args:
- video_in: string or path-like object pointing to the video file
- interval: integer specifying how many frames apart to extract images (default: 24)
- output_format: string indicating desired format for saved images (default: '.jpg')
Returns:
A list of strings containing paths to saved images.
"""
# Initialize variables
vidcap = cv2.VideoCapture(video_in)
frames = []
count = 0
# Loop through frames until there are no more
while True:
success, image = vidcap.read()
# Check if successful read and not past end of video
if success:
print('Read a new frame:', success)
# Save the current frame if it falls on the sampling interval
if count % interval == 0:
filename = f'frame_{count // interval}{output_format}'
frames.append(filename)
cv2.imwrite(filename, image)
print(f'Saved {filename}')
# Increment counter
count += 1
# Break out of loop when done reading frames
else:
break
# Close video capture
vidcap.release()
print('Done extracting frames!')
return frames
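# Caption a single extracted frame by querying the moondream1 Gradio Space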
def process_image(image_in):
client = Client("https://vikhyatk-moondream1.hf.space/")
result = client.predict(
image_in, # filepath in 'image' Image component
"Describe precisely the image in one sentence.", # str in 'Question' Textbox component
api_name="/answer_question"
#api_name="/predict"
)
print(result)
return result
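# Extract the audio track from the video with moviepy and write it to an mp3 file (assumes the video has an audio track)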
def extract_audio(video_path):
video_clip = VideoFileClip(video_path)
audio_clip = video_clip.audio
audio_clip.write_audiofile("output_audio.mp3")
return "output_audio.mp3"
def get_salmonn(audio_in):
salmonn_prompt = "Please describe the audio"
client = Client("fffiloni/SALMONN-7B-gradio")
result = client.predict(
audio_in, # filepath in 'Audio' Audio component
salmonn_prompt, # str in 'User question' Textbox component
4, # float (numeric value between 1 and 10) in 'beam search numbers' Slider component
1, # float (numeric value between 0.8 and 2.0) in 'temperature' Slider component
0.9, # float (numeric value between 0.1 and 1.0) in 'top p' Slider component
api_name="/gradio_answer"
)
print(result)
return result
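# Wrap the captions in a Zephyr chat-style prompt and ask the LLM for a short summary of the video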
def llm_process(user_prompt):
instruction = f"""
<|system|>
{standard_sys}</s>
<|user|>
"""
prompt = f"{instruction.strip()}\n{user_prompt}</s>"
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
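# Strip everything from the <|system|> tag through the model-emitted <|assistant|> tag, keeping only the reply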
pattern = r'\<\|system\|\>(.*?)\<\|assistant\|\>'
cleaned_text = re.sub(pattern, '', outputs[0]["generated_text"], flags=re.DOTALL)
print(f"SUGGESTED video description: {cleaned_text}")
return cleaned_text.lstrip("\n")
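# Full pipeline: sample frames, caption each frame, describe the audio, then summarize everything with the LLM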
def infer(video_in):
# Extract frames from a video
frame_files = extract_frames(video_in)
# Process each extracted frame and collect results in a list
processed_texts = []
for frame_file in frame_files:
text = process_image(frame_file)
processed_texts.append(text)
print(processed_texts)
# Join the processed texts into a single string separated by line breaks
string_list = '\n'.join(processed_texts)
# Extract audio from video
extracted_audio = extract_audio(video_in)
print(extracted_audio)
# Get description of audio content
audio_content_described = get_salmonn(extracted_audio)
# Assemble captions
formatted_captions = f"""
### Visual events:\n{string_list}\n ### Audio events:\n{audio_content_described}
"""
print(formatted_captions)
# Send formatted captions to LLM
video_description_from_llm = llm_process(formatted_captions)
return video_description_from_llm
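# Minimal Gradio UI: a video input, a submit button, and a textbox for the generated description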
with gr.Blocks() as demo:
with gr.Column(elem_id="col-container"):
gr.HTML("""
<h2 style="text-align: center;">Video description</h2>
""")
video_in = gr.Video(label="Video input")
submit_btn = gr.Button("Submit")
video_description = gr.Textbox(label="Video description")
submit_btn.click(
fn = infer,
inputs = [video_in],
outputs = [video_description]
)
demo.queue().launch()