File size: 7,832 Bytes
c59fc6b
 
9347b1e
139bf60
c59fc6b
 
97bc44b
d26dd8d
c59fc6b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518eb6e
c59fc6b
139bf60
c59fc6b
 
 
 
08ec8d2
 
 
 
 
c59fc6b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567d6d1
c59fc6b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec4889b
 
 
 
 
 
 
 
 
 
 
 
c59fc6b
 
 
 
 
 
e57843e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c59fc6b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43dee29
9347b1e
 
 
 
 
 
c59fc6b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97bc44b
c59fc6b
e57843e
c59fc6b
20bef95
97bc44b
c59fc6b
e57843e
 
c59fc6b
 
97bc44b
c59fc6b
567d6d1
97bc44b
c59fc6b
 
 
 
 
97bc44b
c59fc6b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import streamlit as st
import torch
import copy
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from typing import Optional
from my_model.utilities import free_gpu_resources
from my_model.captioner.image_captioning import ImageCaptioningModel
from my_model.object_detection import ObjectDetector


class KBVQA:
    """Knowledge-based visual question answering pipeline.

    Bundles three components: an image captioner, an object detector and a
    fine-tuned causal language model. The caption and the detected objects
    are formatted into a prompt from which the language model generates a
    short answer.
    """

    def __init__(self):
        """Set the configuration; no model is loaded until a ``load_*`` method is called."""
        self.kbvqa_model_name = "m7mdal7aj/fine_tunned_llama_2_merged"
        # Must be assigned before create_bnb_config(), which reads it.
        self.quantization = '4bit'
        self.bnb_config = self.create_bnb_config()
        # Prompts longer than this many tokens are rejected by generate_answer().
        self.max_context_window = 4096
        self.add_eos_token = False
        self.trust_remote = False
        self.use_fast = True
        self.kbvqa_tokenizer = None
        self.captioner = None
        self.detector = None
        self.detection_model = None
        self.kbvqa_model = None
        # Read from the environment; None when HUGGINGFACE_TOKEN is unset.
        self.access_token = os.getenv("HUGGINGFACE_TOKEN")

    def create_bnb_config(self) -> Optional["BitsAndBytesConfig"]:
        """
        Creates a BitsAndBytes configuration based on the quantization setting.

        Returns:
            BitsAndBytesConfig: Configuration for the '4bit' or '8bit' setting,
            or None when ``self.quantization`` holds any other value.
        """
        if self.quantization == '4bit':
            return BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
        if self.quantization == '8bit':
            # BitsAndBytesConfig defines no bnb_8bit_* options; the double-quant,
            # quant-type and compute-dtype settings exist for 4-bit loading only.
            return BitsAndBytesConfig(load_in_8bit=True)
        return None

    def load_caption_model(self) -> None:
        """Create the ImageCaptioningModel and load its weights."""
        self.captioner = ImageCaptioningModel()
        self.captioner.load_model()

    def get_caption(self, img):
        """Return the caption the loaded captioner generates for ``img``."""
        return self.captioner.generate_caption(img)

    def load_detector(self, model) -> None:
        """Create the ObjectDetector and load the detection model named by ``model``."""
        self.detector = ObjectDetector()
        self.detector.load_model(model)

    def detect_objects(self, img, threshold=0.2):
        """Run object detection on ``img``.

        Args:
            img: Image to analyse; the boxes are drawn onto this image by the detector.
            threshold: Confidence threshold forwarded to the detector.

        Returns:
            tuple: ``(image_with_boxes, detected_objects_string)``.
        """
        image = self.detector.process_image(img)
        detected_objects_string, detected_objects_list = self.detector.detect_objects(
            image, threshold=threshold
        )
        image_with_boxes = self.detector.draw_boxes(img, detected_objects_list)
        return image_with_boxes, detected_objects_string

    def load_fine_tuned_model(self) -> None:
        """Load the fine-tuned language model and its tokenizer from ``self.kbvqa_model_name``."""
        self.kbvqa_model = AutoModelForCausalLM.from_pretrained(
            self.kbvqa_model_name,
            device_map="auto",
            low_cpu_mem_usage=True,
            quantization_config=self.bnb_config,
            token=self.access_token,
        )

        # low_cpu_mem_usage is a model-loading option, so it is not passed to the tokenizer.
        self.kbvqa_tokenizer = AutoTokenizer.from_pretrained(
            self.kbvqa_model_name,
            use_fast=self.use_fast,
            trust_remote_code=self.trust_remote,
            add_eos_token=self.add_eos_token,
            token=self.access_token,
        )

    @property
    def all_models_loaded(self) -> bool:
        """True when the language model, the captioner and the detector are all set."""
        return (
            self.kbvqa_model is not None
            and self.captioner is not None
            and self.detector is not None
        )

    def force_reload_model(self) -> None:
        """Drop the loaded components and release GPU resources.

        The attributes are reset to None rather than deleted, so that
        ``all_models_loaded`` and repeated calls keep working afterwards.
        """
        free_gpu_resources()
        self.kbvqa_model = None
        self.captioner = None
        self.detector = None
        # Called again now that this object no longer references the components.
        free_gpu_resources()

    def format_prompt(self, current_query, history=None, sys_prompt=None, caption=None, objects=None):
        """Build the prompt string for the language model.

        Args:
            current_query: The user's question; surrounding whitespace is stripped.
            history: Earlier conversation text. When given, only the new question
                is appended to it and the system prompt, caption and objects are
                not included.
            sys_prompt: System prompt; a built-in default is used when None.
            caption: Image caption wrapped in [CAP] tags; omitted when None.
            objects: Detected-objects description wrapped in [OBJ] tags; omitted
                when None.

        Returns:
            str: The formatted prompt.
        """
        if sys_prompt is None:
            sys_prompt = "You are a helpful, respectful and honest assistant for visual question answering. you are provided with a caption of an image and a list of objects detected in the image along with their bounding boxes and level of certainty, you will output an answer to the given questions in no more than one sentence. Use logical reasoning to reach to the answer, but do not output your reasoning process unless asked for it. If provided, you will use the [CAP] and [/CAP] tags to indicate the begining and end of the caption respectively. If provided you will use the [OBJ] and [/OBJ] tags to indicate the begining and end of the list of detected objects in the image along with their bounding boxes respectively.if provided, you will use [QES] and [/QES] tags to indicate the begining and end of the question respectively."

        B_SENT = '<s>'
        B_INST = '[INST]'
        E_INST = '[/INST]'
        B_SYS = '<<SYS>>\n'
        E_SYS = '\n<</SYS>>\n\n'
        B_CAP = '[CAP]'
        E_CAP = '[/CAP]'
        B_QES = '[QES]'
        E_QES = '[/QES]'
        B_OBJ = '[OBJ]'
        E_OBJ = '[/OBJ]'

        current_query = current_query.strip()
        sys_prompt = sys_prompt.strip()

        if history is not None:
            return f"{history}\n{B_SENT}{B_INST} {B_QES}{current_query}{E_QES}{E_INST}"

        # Leave the caption out entirely instead of rendering the text "None".
        caption_part = "" if caption is None else f"{B_CAP}{caption}{E_CAP}"
        if objects is None:
            objects_part = ""
            question = current_query
        else:
            objects_part = f"{B_OBJ}{objects}{E_OBJ}"
            question = f"taking into consideration the objects with high certainty, {current_query}"

        return (
            f"{B_SENT}{B_INST} {B_SYS}{sys_prompt}{E_SYS}"
            f"{caption_part}{objects_part}{B_QES}{question}{E_QES}{E_INST}"
        )

    def generate_answer(self, question, image) -> Optional[str]:
        """Answer ``question`` about ``image``.

        The image is captioned and run through the object detector, the results
        are formatted into a prompt, and the language model generates the answer.
        Intermediate results are written to the Streamlit page.

        Args:
            question: The user's question.
            image: The image to analyse; a deep copy is used so the caller's
                object is not drawn on.

        Returns:
            The generated answer with its first character capitalised, or None
            when the prompt exceeds ``self.max_context_window`` tokens.
        """
        img = copy.deepcopy(image)
        st.write('image being detcted')
        st.image(img)
        caption = self.get_caption(img)
        _, detected_objects_str = self.detect_objects(img)
        st.write(detected_objects_str)
        prompt = self.format_prompt(question, caption=caption, objects=detected_objects_str)
        num_tokens = len(self.kbvqa_tokenizer.tokenize(prompt))
        if num_tokens > self.max_context_window:
            st.write(f"Prompt too long with {num_tokens} tokens, consider increasing the confidence threshold for the object detector")
            return None

        # Send the inputs to whichever device the model was placed on instead of
        # assuming 'cuda'.
        model_inputs = self.kbvqa_tokenizer(
            prompt, add_special_tokens=False, return_tensors="pt"
        ).to(self.kbvqa_model.device)
        input_ids = model_inputs["input_ids"]
        output_ids = self.kbvqa_model.generate(input_ids)
        # The output starts with the prompt tokens; skip them when decoding.
        prompt_length = input_ids.shape[1]
        output_text = self.kbvqa_tokenizer.decode(
            output_ids[0][prompt_length:], skip_special_tokens=True
        )

        return output_text.capitalize()

def prepare_kbvqa_model(detection_model) -> Optional["KBVQA"]:
    """Create a KBVQA instance and load its detector, captioner and language model.

    Loading progress is shown on the Streamlit page with a spinner and a
    progress bar.

    Args:
        detection_model: Identifier of the object-detection model, stored on the
            instance and passed to ``KBVQA.load_detector``.

    Returns:
        The KBVQA instance with its language model switched to eval mode, or
        None when not all components report as loaded.
    """
    free_gpu_resources()
    kbvqa = KBVQA()
    kbvqa.detection_model = detection_model

    # Progress bar for model loading
    with st.spinner('Loading model...'):
        progress_bar = st.progress(0)

        kbvqa.load_detector(kbvqa.detection_model)
        progress_bar.progress(33)

        kbvqa.load_caption_model()
        free_gpu_resources()
        progress_bar.progress(66)

        kbvqa.load_fine_tuned_model()
        free_gpu_resources()
        progress_bar.progress(100)

    if not kbvqa.all_models_loaded:
        # Tell the user instead of silently returning None.
        st.error('Model loading failed: one or more components could not be loaded.')
        return None

    st.success('Model loaded successfully!')
    kbvqa.kbvqa_model.eval()
    free_gpu_resources()
    return kbvqa