File size: 1,348 Bytes
a0a5594
87ce8f2
 
 
 
8a770f2
024e8dc
87ce8f2
 
 
 
 
 
 
 
 
 
097ab33
87ce8f2
 
024e8dc
 
2d61ab3
024e8dc
 
 
8a770f2
3e3aa94
 
024e8dc
3e3aa94
 
 
 
 
 
8a770f2
3e3aa94
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from typing import Dict, List, Any
import sys
sys.path.append('./')
from videollama2 import model_init, mm_infer
from videollama2.utils import disable_torch_init
import logging
import numpy as np

class EndpointHandler:
    """Inference-endpoint handler wrapping the VideoLLaMA2 multimodal model.

    The model, processor, and tokenizer are loaded once at construction time;
    each call to the handler runs one video-understanding inference.
    """

    # Default prompt: ask the model for per-scene descriptions with timestamps.
    # (Kept byte-identical to the original hard-coded instruction.)
    _DEFAULT_INSTRUCT = (
        "Can you explain each scene and provide the exact time of the video "
        "in which it happened in this format [start_time: end_time]: "
        "Description, [start_time: end_time]: Description ..."
    )

    def __init__(self, path: str = ""):
        """
        Initialize the handler by loading the model and any other necessary components.

        Args:
            path (str): Model path or repository id. When empty (the default),
                falls back to the built-in checkpoint. The original code
                accepted this argument but ignored it entirely; honoring it
                keeps the interface identical while making the checkpoint
                configurable.
        """
        disable_torch_init()
        # Fall back to the hard-coded repo id so path="" behaves exactly as before.
        self.model_path = path or 'Aliayub1995/VideoLLaMA2-7B'
        self.model, self.processor, self.tokenizer = model_init(self.model_path)

    def __call__(
        self,
        video_tensor: np.ndarray,
        instruct: str = _DEFAULT_INSTRUCT,
        modal: str = "video",
    ) -> List[Dict[str, Any]]:
        """Run inference on a pre-decoded video tensor.

        Args:
            video_tensor: Decoded video frames as a numpy array.
                (NOTE(review): expected layout/dtype is determined by
                ``self.processor[modal]`` — not visible here; confirm against
                the videollama2 preprocessing code.)
            instruct: Prompt sent to the model; defaults to the original
                timestamped scene-description instruction.
            modal: Modality key selecting the preprocessor; defaults to "video".

        Returns:
            A single-element list holding ``{"output": <model text>}``.
        """
        logging.info("Received video tensor")  # Debugging: Confirm video tensor received

        # Preprocess with the modality-specific processor, then generate
        # deterministically (do_sample=False, greedy decoding).
        output = mm_infer(
            self.processor[modal](video_tensor),
            instruct,
            model=self.model,
            tokenizer=self.tokenizer,
            do_sample=False,
            modal=modal,
        )

        return [{"output": output}]