import io
import os
import tempfile

import cv2
import gradio as gr
import magic  # python-magic; requires the libmagic system library
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer


# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the model and tokenizer
model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
                                             torch_dtype=torch.float16, 
                                             device_map="auto",
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
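# Note: device_map="auto" relies on the `accelerate` package being installed.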


def get_file_type_from_bytes(file_bytes):
    """Determine whether a file is an image or a video based on its MIME type from bytes."""
    mime = magic.Magic(mime=True)
    mimetype = mime.from_buffer(file_bytes)
    if mimetype.startswith('image'):
        return 'image'
    elif mimetype.startswith('video'):
        return 'video'
    return 'unknown'
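
# Example usage (hypothetical file, for illustration only):
#   get_file_type_from_bytes(open("slide.png", "rb").read())  # -> 'image'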

def process_video(video_bytes):
    """Extract up to 4 frames from the video, roughly 1 per second."""
    # OpenCV cannot read from an in-memory buffer, so spill the bytes
    # to a temporary file first.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        tmp.write(video_bytes)
        temp_path = tmp.name
    try:
        video = cv2.VideoCapture(temp_path)
        step = max(int(video.get(cv2.CAP_PROP_FPS)), 1)  # ~1 second of frames
        frames = []
        success, frame = video.read()
        while success and len(frames) < 4:
            frames.append(frame)
            for _ in range(step):  # skip ahead ~1 second
                success, frame = video.read()
        video.release()
    finally:
        os.remove(temp_path)
    return frames
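
# Quick sanity check (hypothetical local clip, shown for illustration):
# with open("sample.mp4", "rb") as f:
#     frames = process_video(f.read())
# assert len(frames) <= 4  # ~1 frame per second, capped at 4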


def predict_answer(file, question, max_tokens=100):
    # Build the LLaVA-style prompt the model expects.
    text = ("A chat between a curious user and an artificial intelligence assistant. "
            "The assistant gives helpful, detailed, and polite answers to the user's questions. "
            f"USER: <image>\n{question}? ASSISTANT:")

    file_type = get_file_type_from_bytes(file)

    if file_type == 'image':
        # Process as an image
        frame = Image.open(io.BytesIO(file)).convert("RGB")
        input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
        image_tensor = model.image_preprocess(frame)

        # Generate the answer
        output_ids = model.generate(
            input_ids,
            max_new_tokens=max_tokens,
            images=image_tensor,
            use_cache=True)[0]

        return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()

    elif file_type == 'video':
        # Process as a video: answer the question once per sampled frame
        frames = process_video(file)
        answers = []
        for frame in frames:
            # cv2 frames are BGR numpy arrays; convert to RGB PIL images
            frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
            image_tensor = model.image_preprocess(frame)

            # Generate the answer
            output_ids = model.generate(
                input_ids,
                max_new_tokens=max_tokens,
                images=image_tensor,
                use_cache=True)[0]

            answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
            answers.append(answer)
        return "\n".join(answers)

    else:
        return "Unsupported file type. Please upload an image or video."


def gradio_predict(file, question, max_tokens):
    # Slider values arrive as numbers; coerce to int for generate()
    return predict_answer(file, question, int(max_tokens))

# examples = [["data/week_01_page_024.png", 'Can you explain the slide?',100], 
#             ["data/week_03_page_091.png", 'Can you explain the slide?',100], 
#             ["data/week_01_page_062.png", 'Are the training images labeled?',100], 
#             ["data/week_05_page_027.png", 'What is meant by eigenvalue multiplicity?',100],
#             ["data/week_05_page_030.png", 'What does K represent?',100], 
#             ["data/week_15_page_046.png", 'How are individual heterogeneous models trained?',100], 
#             ["data/week_15_page_021.png", 'How does Bagging affect error?',100],
#             ["data/week_15_page_037.png", "What does the '+' and '-' represent?",100]]

# Define the Gradio interface
iface = gr.Interface(
    fn=gradio_predict,
    inputs=[gr.File(label="Upload an Image or Video", type="binary"),  # "binary" -> handler receives raw bytes
            # gr.Image(type="pil", label="Upload or Drag an Image"),
            gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
            gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
    outputs=gr.TextArea(label="Answer"),
    # examples=examples,
    title="Super Rapid Annotator - a multimodal vision tool for annotating images and videos with the LLaVA framework",
    # description="An interactive chat model that answers questions about images in an academic context. \n Given an image, the system analyzes it and describes its contents. I fed it slides from class PowerPoint presentations paired with the lecture content as text, so the model now mimics my professors' responses: present any slide and it explains it the way my professor would, and it can be personalized further.",
)

# Launch the app
iface.queue().launch(debug=True)
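# queue() enables request queuing so long generations don't hit connection timeouts.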