# Modified from https://github.com/mit-han-lab/llm-awq/blob/main/tinychat/vlm_demo_new.py. import argparse import os from pathlib import Path import numpy as np import pandas as pd import torch from accelerate import load_checkpoint_and_dispatch, PartialState from accelerate.utils import gather_object from decord import VideoReader from PIL import Image from natsort import natsorted from tqdm import tqdm from transformers import AutoConfig, AutoTokenizer import tinychat.utils.constants # from tinychat.models.llava_llama import LlavaLlamaForCausalLM from tinychat.models.vila_llama import VilaLlamaForCausalLM from tinychat.stream_generators.llava_stream_gen import LlavaStreamGenerator from tinychat.utils.conversation_utils import gen_params from tinychat.utils.llava_image_processing import process_images from tinychat.utils.prompt_templates import ( get_image_token, get_prompter, get_stop_token_ids, ) from tinychat.utils.tune import ( device_warmup, tune_llava_patch_embedding, ) from utils.filter import filter from utils.logger import logger gen_params.seed = 1 gen_params.temp = 1.0 gen_params.top_p = 1.0 def extract_uniform_frames(video_path: str, num_sampled_frames: int = 8): vr = VideoReader(video_path) sampled_frame_idx_list = np.linspace(0, len(vr), num_sampled_frames, endpoint=False, dtype=int) sampled_frame_list = [] for idx in sampled_frame_idx_list: sampled_frame = Image.fromarray(vr[idx].asnumpy()) sampled_frame_list.append(sampled_frame) return sampled_frame_list def stream_output(output_stream): for outputs in output_stream: output_text = outputs["text"] output_text = output_text.strip().split(" ") # print(f"output_text: {output_text}.") return " ".join(output_text) def skip(*args, **kwargs): pass def parse_args(): parser = argparse.ArgumentParser(description="Recaption videos with VILA1.5.") parser.add_argument( "--video_metadata_path", type=str, default=None, help="The path to the video dataset metadata (csv/jsonl).", ) parser.add_argument( "--video_path_column", type=str, default="video_path", help="The column contains the video path (an absolute path or a relative path w.r.t the video_folder).", ) parser.add_argument( "--caption_column", type=str, default="caption", help="The column contains the caption.", ) parser.add_argument( "--video_folder", type=str, default="", help="The video folder." ) parser.add_argument("--input_prompt", type=str, default="