import gradio as gr
from gradio_client import Client
import cv2
from moviepy.editor import VideoFileClip
# Processing pipeline:
# 1. extract and store one frame every `interval` frames (24 by default) from the video input
# 2. extract the audio track
# 3. for each extracted frame, get a caption from the image captioning model and collect the captions into a list
# 4. for the audio, ask an audio question-answering model to describe the sound/scene
# 5. give everything to the LLM and ask it to summarize, combining the image caption list with the audio caption
import re
import torch
from transformers import pipeline
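# Load the Zephyr-7B chat model once at startup (bfloat16 weights, automatic device placement)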
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
pipe = pipeline("text-generation", model=zephyr_model, torch_dtype=torch.bfloat16, device_map="auto")
standard_sys = f"""
You will be provided a list of visual events and an audio description. All of this information comes from a single video.
The visual events were extracted from the video every 24 frames.
The video is usually a short sequence.
Repetitive descriptions of the same person or group of subjects mean it is the same person/subject, filmed without a cut.
The audio description was generated from the video's audio track.
Your job is to use this information to smartly deduce and provide a very short summary of what is happening in the video.
"""
def extract_frames(video_in, interval=24, output_format='.jpg'):
"""Extract frames from a video at a specified interval and store them in a list.
Args:
- video_in: string or path-like object pointing to the video file
- interval: integer specifying how many frames apart to extract images (default: 24)
- output_format: string indicating desired format for saved images (default: '.jpg')
Returns:
A list of strings containing paths to saved images.
"""
# Initialize variables
vidcap = cv2.VideoCapture(video_in)
frames = []
count = 0
# Loop through frames until there are no more
while True:
success, image = vidcap.read()
# Check if successful read and not past end of video
if success:
print('Read a new frame:', success)
# Save the current frame if it falls on the sampling interval
if count % interval == 0:
filename = f'frame_{count // interval}{output_format}'
frames.append(filename)
cv2.imwrite(filename, image)
print(f'Saved {filename}')
# Increment counter
count += 1
# Break out of loop when done reading frames
else:
break
# Close video capture
vidcap.release()
print('Done extracting frames!')
return frames
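# Caption a single extracted frame by querying the moondream1 Gradio Space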
def process_image(image_in):
client = Client("https://vikhyatk-moondream1.hf.space/")
result = client.predict(
image_in, # filepath in 'image' Image component
"Describe precisely the image in one sentence.", # str in 'Question' Textbox component
api_name="/answer_question"
#api_name="/predict"
)
print(result)
return result
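# Extract the audio track from the video with moviepy and write it to an mp3 file (assumes the video has an audio track)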
def extract_audio(video_path):
video_clip = VideoFileClip(video_path)
audio_clip = video_clip.audio
audio_clip.write_audiofile("output_audio.mp3")
return "output_audio.mp3"
def get_salmonn(audio_in):
salmonn_prompt = "Please describe the audio"
client = Client("fffiloni/SALMONN-7B-gradio")
result = client.predict(
audio_in, # filepath in 'Audio' Audio component
salmonn_prompt, # str in 'User question' Textbox component
4, # float (numeric value between 1 and 10) in 'beam search numbers' Slider component
1, # float (numeric value between 0.8 and 2.0) in 'temperature' Slider component
0.9, # float (numeric value between 0.1 and 1.0) in 'top p' Slider component
api_name="/gradio_answer"
)
print(result)
return result
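# Wrap the captions in a Zephyr chat-style prompt and ask the LLM for a short summary of the video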
def llm_process(user_prompt):
instruction = f"""
<|system|>
{standard_sys}</s>
<|user|>
"""
prompt = f"{instruction.strip()}\n{user_prompt}</s>"
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
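# Strip everything from the <|system|> tag through the model-emitted <|assistant|> tag, keeping only the reply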
pattern = r'\<\|system\|\>(.*?)\<\|assistant\|\>'
cleaned_text = re.sub(pattern, '', outputs[0]["generated_text"], flags=re.DOTALL)
print(f"SUGGESTED video description: {cleaned_text}")
return cleaned_text.lstrip("\n")
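# Full pipeline: sample frames, caption each frame, describe the audio, then summarize everything with the LLM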
def infer(video_in):
# Extract frames from a video
frame_files = extract_frames(video_in)
# Process each extracted frame and collect results in a list
processed_texts = []
for frame_file in frame_files:
text = process_image(frame_file)
processed_texts.append(text)
print(processed_texts)
# Join the processed texts into a single string separated by line breaks
string_list = '\n'.join(processed_texts)
# Extract audio from video
extracted_audio = extract_audio(video_in)
print(extracted_audio)
# Get description of audio content
audio_content_described = get_salmonn(extracted_audio)
# Assemble captions
formatted_captions = f"""
### Visual events:\n{string_list}\n ### Audio events:\n{audio_content_described}
"""
print(formatted_captions)
# Send formatted captions to LLM
video_description_from_llm = llm_process(formatted_captions)
return video_description_from_llm
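# Minimal Gradio UI: a video input, a submit button, and a textbox for the generated description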
with gr.Blocks() as demo:
with gr.Column(elem_id="col-container"):
gr.HTML("""
<h2 style="text-align: center;">Video description</h2>
""")
video_in = gr.Video(label="Video input")
submit_btn = gr.Button("Submit")
video_description = gr.Textbox(label="Video description")
submit_btn.click(
fn = infer,
inputs = [video_in],
outputs = [video_description]
)
demo.queue().launch()