File size: 2,940 Bytes
7ec133b
 
 
 
 
feb7d12
cb78e99
7ec133b
9916357
 
c52e238
 
 
 
 
 
 
 
7ec133b
f8dcf83
a4115fd
 
 
f8dcf83
7ec133b
702cb53
89e1517
 
a58a8ea
7ec133b
5b16922
89e1517
7ec133b
89e1517
 
 
702cb53
89e1517
 
7ec133b
89e1517
7ec133b
a4001c8
702cb53
7ec133b
 
9916357
7ec133b
 
 
702cb53
bed8af5
865f315
7ec133b
9916357
9da530e
14273c5
7ec133b
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import gradio as gr
from PIL import Image
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# The app requires a CUDA GPU: new tensors are created on it by default, and
# the inference code in this file moves its inputs to "cuda" explicitly.
torch.set_default_device("cuda")

# Fetch the slide that is offered as a clickable example in the UI.
# The raw-content host is required here: a github.com ".../blob/..." URL
# serves the HTML viewer page instead of the PNG bytes, which leaves a file
# on disk that cannot be opened as an image.
torch.hub.download_url_to_file(
    'https://raw.githubusercontent.com/manishkumart/SparrowVQE/main/data/Images/week_01/week_01_page_024.png',
    'week_01_page_024.png',
)

# NOTE(review): torch.set_default_tensor_type is deprecated as of PyTorch 2.1
# and overlaps with set_default_device above. It is kept so start-up behaviour
# stays unchanged -- confirm it can be dropped.
torch.set_default_tensor_type('torch.cuda.FloatTensor')


# Initialize the model and tokenizer.
# - torch_dtype=torch.float16 loads the weights in half precision.
# - device_map="auto" lets the loader decide where the weights are placed.
# - trust_remote_code=True runs Python code shipped in the model repository,
#   so the repository id below must be one you trust.
model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
                                             torch_dtype=torch.float16,
                                             device_map="auto",
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)

def predict_answer(image, question, max_tokens):
    """Answer a question about an image with the loaded model.

    Args:
        image: PIL image to ask about; it is converted to RGB before use.
        question: The user's question as plain text. A trailing "?" is added
            only when the question does not already end with one.
        max_tokens: Upper bound on the number of newly generated tokens.
            Coerced to ``int`` because a UI slider may deliver a float.

    Returns:
        The generated answer as a string, with the prompt tokens and special
        tokens removed and surrounding whitespace stripped.

    Raises:
        gr.Error: If no image was supplied.
    """
    # Gradio passes None when the image component is left empty; fail with a
    # message the UI can show instead of an AttributeError on .convert().
    if image is None:
        raise gr.Error("Please upload an image before asking a question.")

    # Avoid "??" in the prompt when the user already typed the question mark.
    question = (question or "").rstrip()
    if not question.endswith("?"):
        question += "?"

    # Set inputs
    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question} ASSISTANT:"
    image = image.convert("RGB")

    input_ids = tokenizer(text, return_tensors='pt').input_ids.to('cuda')
    image_tensor = model.image_preprocess(image)

    # Generate the answer; [0] selects the single sequence in the batch.
    output_ids = model.generate(
        input_ids,
        max_new_tokens=int(max_tokens),
        images=image_tensor,
        use_cache=True)[0]

    # Drop the echoed prompt tokens so only the newly generated text remains.
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()

def gradio_predict(image, question, max_tokens):
    """Gradio callback: pass the UI inputs to predict_answer and return its answer."""
    return predict_answer(image, question, max_tokens)

# One example row: the slide image saved next to this script and a question.
# The token-count slider is not part of the example and keeps its own value.
examples = [["week_01_page_024.png", "Can you explain the slide?"]]

# Define the Gradio interface.
# Inputs are passed to gradio_predict in this order: image, question, token count.
iface = gr.Interface(
    fn=gradio_predict,
    inputs=[gr.Image(type="pil", label="Upload or Drag an Image"),
            gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
            # step=1 keeps the token count a whole number.
            gr.Slider(2, 500, value=25, step=1, label="Token Count", info="Choose between 2 and 500")],
    outputs=gr.TextArea(label="Answer"),
    examples=examples,  # the missing comma here was a SyntaxError
    title="Sparrow - Tiny 3B | Visual Question Answering",
    description="An interactive chat model that can answer questions about images in an Academic context. \n We can input images, and the system will analyze them to provide information about their contents. I've utilized this capability by feeding slides from PowerPoint presentations used in classes and the lecture content passed as text. Consequently, the model now mimics the behavior and responses of my professors. So, if I present any PowerPoint slide, it explains it just like my professor would, further it can be personalized.",
)

# Launch the app with request queuing enabled; debug=True is passed to launch().
iface.queue().launch(debug=True)