import glob
from collections import namedtuple

from PIL import Image

from embeddings import FaissIndex, VectorSearch

# One parsed line of ``history.txt``. Defined once at module level so every
# call to ``Summary.parse_history`` yields records of the same type.
Record = namedtuple("Record", ["frame", "places", "objects", "activities"])


class Summary:
    """Turn a video's ``history.txt`` log into prompts and run them on an LLM.

    Attributes:
        video_dir: Directory that contains ``history.txt``.
        llm: Callable invoked with a prompt; see ``call_model`` for the
            result shape this class relies on.
        vs: ``VectorSearch`` instance used to build the prompts.
    """

    def __init__(self, video_dir, llm):
        """Store the video directory and LLM, and create a ``VectorSearch``."""
        self.video_dir = video_dir
        self.llm = llm
        self.vs = VectorSearch()

    def flatten_list(self, s):
        """Return a new flat list with the items of arbitrarily nested lists.

        Items are emitted in depth-first, left-to-right order. Only ``list``
        instances are descended into; any other item is kept as-is.

        Args:
            s: A (possibly nested) list.

        Returns:
            A new list containing every non-list item found in ``s``.
        """
        flat = []
        # Explicit stack of iterators instead of recursion: avoids hitting the
        # interpreter recursion limit on long inputs and the repeated slicing
        # a recursive head/tail split would need.
        stack = [iter(s)]
        while stack:
            for item in stack[-1]:
                if isinstance(item, list):
                    # Descend; the parent iterator keeps its position.
                    stack.append(iter(item))
                    break
                flat.append(item)
            else:
                # Current level exhausted: resume the parent.
                stack.pop()
        return flat

    @staticmethod
    def _split_csv(text):
        """Split ``text`` on commas and strip whitespace from every piece."""
        return [part.strip() for part in text.strip().split(",")]

    def parse_history(self):
        """Parse ``<video_dir>/history.txt`` into a list of ``Record`` tuples.

        Each non-blank line is split on ``:`` and then on ``.``; the resulting
        flat token list is read positionally: token 0 is the frame, token 3
        the places, token 5 the objects and the last token the activities.
        The last three are further split on commas.

        NOTE(review): the positional indices are dictated by whatever writes
        ``history.txt``, which is not visible here -- confirm against it.

        Returns:
            A list of ``Record(frame, places, objects, activities)``.

        Raises:
            IndexError: If a non-blank line yields fewer than six tokens.
        """
        history_proc = []
        with open(f"{self.video_dir}/history.txt") as f:
            for line in f:
                hist = line.strip()
                if not hist:
                    # Blank lines carry no record and would otherwise fail
                    # the positional lookups below.
                    continue
                flat = self.flatten_list([x.split(".") for x in hist.split(":")])
                history_proc.append(
                    Record(
                        frame=flat[0],
                        places=self._split_csv(flat[3]),
                        objects=self._split_csv(flat[5]),
                        activities=self._split_csv(flat[-1]),
                    )
                )
        return history_proc

    def create_prompts(self, history_proc, chunk_size=5):
        """Build one prompt per consecutive chunk of history records.

        Args:
            history_proc: Sequence of records, as returned by
                ``parse_history``.
            chunk_size: Maximum number of records per prompt. The final chunk
                may be shorter.

        Returns:
            A list with the value of ``self.vs.prompt_summary(chunk)`` for
            every chunk, in order. Empty when ``history_proc`` is empty.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")
        return [
            self.vs.prompt_summary(history_proc[start : start + chunk_size])
            for start in range(0, len(history_proc), chunk_size)
        ]

    def call_model(self, prompts):
        """Run every prompt through ``self.llm`` and pair it with its output.

        ``self.llm(prompt)`` is expected to return a sequence whose first
        element is a mapping with a ``"generated_text"`` key.

        Args:
            prompts: Iterable of prompts.

        Returns:
            A ``zip`` iterator of ``(prompt, generated_text)`` pairs. It can
            be consumed only once; wrap it in ``list(...)`` to reuse it.
        """
        # Materialise first: ``prompts`` is walked twice (model calls, then
        # zip), which would silently yield nothing for a one-shot iterable.
        prompts = list(prompts)
        results = [self.llm(prompt)[0]["generated_text"] for prompt in prompts]
        return zip(prompts, results)

    def generate_summaries(self):
        """Parse the history, build the prompts and return the model output.

        Returns:
            The single-pass ``zip`` iterator produced by ``call_model``.
        """
        history_proc = self.parse_history()
        prompts = self.create_prompts(history_proc)
        return self.call_model(prompts)


class VideoSearch:
    """Look up stored frame images of a video from a text query.

    Attributes:
        video_dir: Directory holding ``video.index`` and the frame ``.jpg``
            files.
        fi: ``FaissIndex`` opened on ``<video_dir>/video.index``.
        vlm: Object providing ``get_text_emb(query)``.
        llm: Optional LLM; stored but not used by the methods of this class.
    """

    def __init__(self, video_dir, vlm, llm=None):
        """Store the collaborators and open the index in ``video_dir``."""
        self.video_dir = video_dir
        self.fi = FaissIndex(faiss_index_location=f"{self.video_dir}/video.index")
        self.vlm = vlm
        self.llm = llm

    def find_nearest_frames(self, query):
        """Embed ``query`` with the VLM and search the index with it.

        Args:
            query: Query passed to ``self.vlm.get_text_emb``.

        Returns:
            A ``(D, frames)`` tuple: the first and third values returned by
            ``self.fi.search``. The second value is discarded.
            NOTE(review): ``D`` is presumably the match distances/scores and
            ``frames`` the matching frame identifiers -- confirm against
            ``FaissIndex.search``.
        """
        query_emb = self.vlm.get_text_emb(query)
        D, _, frames = self.fi.search(query_emb)
        return D, frames

    def get_images(self, frames, k=5):
        """Open the image files of the first ``k`` frames.

        For each frame the first file matching ``<video_dir>/*_<frame>.jpg``
        is opened with ``PIL.Image.open``.

        Args:
            frames: Sliceable sequence of frame identifiers.
            k: Maximum number of frames to load.

        Returns:
            A list of opened PIL images, in the order of ``frames``.

        Raises:
            IndexError: If no file matches a requested frame.
        """
        # Escape the directory so glob metacharacters in the path ("[", "*",
        # "?") are matched literally instead of being treated as a pattern.
        video_dir = glob.escape(str(self.video_dir))
        images = []
        for frame in frames[:k]:
            matches = glob.glob(f"{video_dir}/*_{frame}.jpg")
            if not matches:
                # IndexError is what an empty match list raised before; it is
                # kept for callers, but now says which frame is missing.
                raise IndexError(
                    f"no image matching '*_{frame}.jpg' found in {self.video_dir}"
                )
            images.append(Image.open(matches[0]))
        return images

    def search_engine(self, query, k=5):
        """Return the images of the frames that best match ``query``.

        Args:
            query: Query forwarded to ``find_nearest_frames``.
            k: Maximum number of images to return.

        Returns:
            A list of opened PIL images, as produced by ``get_images``.
        """
        _, frames = self.find_nearest_frames(query)
        return self.get_images(frames, k=k)