"""Video summarisation and semantic frame search.

`Summary` turns a video's per-frame history file into LLM-generated summaries;
`VideoSearch` retrieves the frames closest to a text query via a FAISS index
of frame embeddings.
"""
import glob
from collections import namedtuple

from PIL import Image

from embeddings import FaissIndex, VectorSearch


class Summary:
    """Generate LLM summaries of a video from its per-frame history file."""

    def __init__(self, video_dir, llm):
        self.video_dir = video_dir  # directory containing history.txt for the video
        self.llm = llm              # text-generation callable (LLM)
        self.vs = VectorSearch()    # supplies the summary prompt template

    def flatten_list(self, s):
        """Recursively flatten an arbitrarily nested list into a flat list."""
        if s == []:
            return s
        if isinstance(s[0], list):
            return self.flatten_list(s[0]) + self.flatten_list(s[1:])
        return s[:1] + self.flatten_list(s[1:])

    def parse_history(self):
        """Parse history.txt into one Record per frame."""
        with open(f"{self.video_dir}/history.txt") as f:
            history = [line.strip() for line in f]

        # Split a comma-separated field into trimmed tokens.
        proc = lambda x: [token.strip() for token in x.strip().split(",")]

        Record = namedtuple("Record", "frame places objects activities")
        history_proc = []
        for hist in history:
            # Each line is colon-delimited; flattening the per-part "." splits
            # leaves the frame id at index 0 and the places, objects and
            # activities fields at fixed positions.
            hist_list = hist.split(":")
            flat = self.flatten_list([x.split(".") for x in hist_list])
            frame = flat[0]

            places = proc(flat[3])
            objects = proc(flat[5])
            activities = proc(flat[-1])
            history_proc.append(Record(frame, places, objects, activities))

        return history_proc

    def create_prompts(self, history_proc):
        """Build one summary prompt per group of five consecutive records."""
        prompts = []
        for start in range(0, len(history_proc), 5):
            chunk = history_proc[start : start + 5]
            prompts.append(self.vs.prompt_summary(chunk))

        return prompts

    def call_model(self, prompts):
        """Run the LLM on each prompt and pair it with its generated text."""
        results = []
        for prompt in prompts:
            # Assumes a transformers-style text-generation pipeline that
            # returns a list of {"generated_text": ...} dicts.
            results.append(self.llm(prompt)[0]["generated_text"])

        return list(zip(prompts, results))

    def generate_summaries(self):
        """Parse the history, build prompts, and return (prompt, summary) pairs."""
        history_proc = self.parse_history()
        prompts = self.create_prompts(history_proc)
        results = self.call_model(prompts)
        return results


class VideoSearch:
    """Semantic search over a video's frames via a FAISS index of frame embeddings."""

    def __init__(self, video_dir, vlm, llm=None):
        self.video_dir = video_dir
        self.fi = FaissIndex(faiss_index_location=f"{self.video_dir}/video.index")
        self.vlm = vlm  # vision-language model exposing get_text_emb()
        self.llm = llm  # optional LLM, not used by the search path

    def find_nearest_frames(self, query):
        """Embed the text query and return (distances, frame ids) of the nearest frames."""
        query_emb = self.vlm.get_text_emb(query)
        D, _, frames = self.fi.search(query_emb)
        return D, frames

    def get_images(self, frames, k=5):
        """Load the images for the top-k frame ids from the video directory."""
        images = []
        for frame in frames[:k]:
            # Frame images are stored as <prefix>_<frame>.jpg; skip ids with no match.
            matches = glob.glob(f"{self.video_dir}/*_{frame}.jpg")
            if matches:
                images.append(Image.open(matches[0]))

        return images

    def search_engine(self, query):
        """Return the images of the frames most similar to the text query."""
        _, frames = self.find_nearest_frames(query)
        images = self.get_images(frames)

        return images
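

# Usage sketch (illustrative only): the real entry point, LLM, and VLM live
# elsewhere in this project. The video directory and model name below are
# placeholders, and the LLM is assumed to be a transformers text-generation
# pipeline matching the [0]["generated_text"] access in Summary.call_model().
if __name__ == "__main__":
    from transformers import pipeline

    llm = pipeline("text-generation", model="gpt2")  # placeholder model choice
    for prompt, summary in Summary("videos/example", llm).generate_summaries():
        print(summary)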