File size: 5,544 Bytes
5fe5ca4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
import decord
import numpy as np
import torch
from PIL import Image
import random
from eva_clip.transform import image_transform
# Shared EVA-CLIP preprocessing pipeline (448x448, eval-mode transform);
# applied to every sampled frame in read_video below.
image_processor = image_transform(image_size=448, is_train=False)
def preprocess_multimodal(sources, num_segments):
    """Expand each '<video>' placeholder into the model's frame-token sequence.

    Every '<video>' occurrence in a turn's 'content' is replaced, in place, with
    '<vi_start>' + (num_segments // 2 - 1) repetitions of '<image><eof>' +
    a final '<image><eov>' + '<vi_end>'.

    Args:
        sources: list of conversations; each conversation is a list of dicts
            with 'role' and 'content' keys.
        num_segments: number of sampled video frames (frame tokens come in
            pairs, hence the // 2).

    Returns:
        The same ``sources`` object, mutated in place.
    """
    video_token = '<video>'
    pair_count = num_segments // 2 - 1
    frame_tokens = '<image><eof>' * pair_count + '<image><eov>'
    replacement = '<vi_start>' + frame_tokens + '<vi_end>'
    for conversation in sources:
        for turn in conversation:
            if video_token in turn['content']:
                turn['content'] = turn['content'].replace(video_token, replacement)
    return sources
def preprocess(
    sources,
    tokenizer,
    s_id=None,
):
    """Tokenize conversations with a bilingual video-QA system prompt.

    A system prompt is built from paired English/Chinese instruction templates
    (same index in both lists) and prepended to every conversation, then the
    tokenizer's chat template is applied.

    Args:
        sources: list of conversations (each a list of role/content dicts).
        tokenizer: tokenizer exposing ``apply_chat_template``.
        s_id: optional template index; when None a random template is chosen.

    Returns:
        Token ids as produced by ``tokenizer.apply_chat_template`` with
        ``add_generation_prompt=True`` and ``return_tensors='pt'``.
    """
    en_qa_templates = [
        "Review the given video and answer the question associated with its visual elements.",
        "Watch the provided video and offer an accurate response to the related question.",
        "Scrutinize the video carefully, identifying relevant details in order to address the linked question.",
        "Take a close look at the presented visuals and deliver a precise answer to the corresponding question.",
        "Observe the video attentively and accurately respond to the associated question.",
        "View the video attentively and provide a suitable answer to the posed question.",
        "Examine the video and approach the connected question with an informed response.",
        "Assess the displayed video and answer the subsequent question with accuracy.",
        "Consider the video content and deliver a relevant answer to the corresponding question.",
        "Go through the video, taking into account key aspects, and respond to the question."
    ]
    ch_qa_templates = [
        "审阅所提供的视频,并回答与其视觉元素相关的问题。",
        "观看所提供的视频,对相关问题给出准确的回答。",
        "仔细审查视频,识别相关的细节,回答与之相关的问题。",
        "仔细观察所展示的视觉内容,并对相应的问题给出精确的回答。",
        "认真观察视频并准确回答相关的问题。",
        "详细观看视频,并且对提出的问题给出合适的回答。",
        "观察视频并用有依据的回答来解答相关的问题。",
        "评估展示的视频,并准确地回答随后的问题。",
        "根据视频内容,对相应的问题给出合理的答案。",
        "浏览视频,根据其中的关键内容回答问题。",
    ]
    # Idiom fix: compare against None with `is not None`, not `!=`.
    if s_id is not None:
        index = s_id
    else:
        index = random.choice(range(len(en_qa_templates)))
    system_prompt = f"""You are a helpful assistant, {en_qa_templates[index]} 你是一个乐于助人的助手,{ch_qa_templates[index]}"""
    # NOTE: currently unused — kept for reference alongside the commented-out
    # explicit-template call below; the tokenizer's own chat template is used.
    chat_template = """{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>'
+ message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}
{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}"""
    # Prepend the system prompt to each conversation.
    messages = []
    for source in sources:
        message = [{'role': 'system', 'content': system_prompt}]
        for sentence in source:
            message.append(sentence)
        messages.append(message)
    #input_ids = tokenizer.apply_chat_template(messages, chat_template, add_generation_prompt=True, return_tensors='pt')
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors='pt')
    return input_ids
def get_index(fps, max_frame, num_segments):
    """Select ``num_segments`` frame indices spread across a video.

    Args:
        fps: average frames per second of the video.
        max_frame: number of usable frames (callers pass ``len(vr) - 1``).
        num_segments: number of frames to sample.

    Returns:
        Tuple of (sorted ``int64`` numpy array of frame indices,
        list of per-frame timestamps in seconds, i.e. index / fps).
    """
    num_frames = max_frame
    if num_frames <= 0:
        # Degenerate clip (e.g. a single-frame video, max_frame == 0):
        # repeat frame 0 rather than dividing by zero below.
        out_indices = np.zeros(num_segments)
    elif num_frames <= num_segments:
        # Fewer frames than segments: cycle through the available frames.
        # BUG FIX: `start_idx` was previously undefined here (NameError);
        # sampling starts at frame 0.
        start_idx = 0
        out_indices = start_idx + np.array([idx % num_frames for idx in range(num_segments)])
        out_indices = np.sort(out_indices)
    else:
        # Evenly spaced sample over the whole clip.
        out_indices = np.linspace(0, num_frames - 1, num_segments)
    durations = [idx.item() / fps for idx in out_indices]
    return out_indices.astype(np.int64), durations
def read_video(video_path, num_segments):
    """Decode a video file and sample ``num_segments`` frames.

    Each sampled frame is run through the module-level ``image_processor``
    and stacked into a single tensor.

    Args:
        video_path: path to the video file, decoded with decord.
        num_segments: number of frames to sample via ``get_index``.

    Returns:
        Tuple of (frame tensor of shape [num_segments, ...] as produced by
        ``image_processor``, per-frame timestamps as a ``torch.Tensor``,
        total clip duration in seconds).
    """
    reader = decord.VideoReader(video_path)
    frame_rate = float(reader.get_avg_fps())
    total_duration = len(reader) / frame_rate
    indices, durations = get_index(frame_rate, len(reader) - 1, num_segments)
    frames = [
        image_processor(Image.fromarray(reader[i].asnumpy())).unsqueeze(0)
        for i in indices
    ]
    return torch.concat(frames), torch.Tensor(durations), total_duration
def get_input(video_path, num_segments, question, history, tokenizer, s_id):
    """Build one round of model inputs for video question answering.

    Args:
        video_path: path to the video, decoded by ``read_video``.
        num_segments: number of frames to sample.
        question: the user's question for this turn.
        history: prior conversation (list of role/content dicts) or None for
            the first turn. When provided it is extended IN PLACE.
        tokenizer: tokenizer forwarded to ``preprocess``.
        s_id: system-prompt template index forwarded to ``preprocess``
            (None selects one at random).

    Returns:
        Tuple of (frame tensor, per-frame timestamps, token ids,
        updated conversation list).
    """
    video, durations, total_duration = read_video(video_path, num_segments)
    # Idiom fix: compare against None with `is None`, not `==`.
    if history is None:
        # First turn: include the <video> placeholder so frame tokens get
        # spliced in by preprocess_multimodal.
        conversations = []
        conversations.append({'role': 'user', 'content': f'<video>\n{question}'})
    else:
        # Follow-up turn: the video tokens are already in the history.
        conversations = history
        conversations.append({'role': 'user', 'content': question})
    sources = [conversations]
    sources = preprocess_multimodal(sources, video.shape[0])
    input_ids = preprocess(sources, tokenizer, s_id=s_id)
    return video, durations, input_ids, conversations
def add_pred_to_history(history, pred):
    """Append the assistant's reply to the running conversation history.

    Mutates ``history`` in place and returns it for convenience.
    """
    entry = {'role': 'assistant', 'content': pred}
    history.append(entry)
    return history
|