import gradio as gr
from PIL import Image
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set the default device to CUDA so tensors are created on the GPU
torch.set_default_device("cuda")

# Download the example slide image (raw file URL, not the GitHub blob page, so the PNG itself is fetched)
torch.hub.download_url_to_file('https://github.com/manishkumart/SparrowVQE/raw/main/data/Images/week_01/week_01_page_024.png', 'week_01_page_024.png')

# # Ensure GPU usage if available
# device = "cuda" if torch.cuda.is_available() else "cpu"
# torch.set_default_tensor_type('torch.cuda.FloatTensor' if device=='cuda' else 'torch.FloatTensor')
torch.set_default_tensor_type('torch.cuda.FloatTensor')
# Initialize the model and tokenizer
model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
                                             torch_dtype=torch.float16,
                                             device_map="auto",
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)

def predict_answer(image, question, max_tokens):
    # Set up the inputs: prompt text, RGB image, token ids, and preprocessed image tensor
    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
    image = image.convert("RGB")
    input_ids = tokenizer(text, return_tensors='pt').input_ids.to('cuda')
    image_tensor = model.image_preprocess(image)

    # Generate the answer and decode only the newly generated tokens
    output_ids = model.generate(
        input_ids,
        max_new_tokens=max_tokens,
        images=image_tensor,
        use_cache=True)[0]

    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()

def gradio_predict(image, question, max_tokens):
    answer = predict_answer(image, question, max_tokens)
    return answer

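# A minimal direct-call sketch (an assumption, not part of the app flow): if the
# example image downloaded above is present in the working directory,
# predict_answer can be exercised without the Gradio UI, e.g.:
#
#   sample = Image.open("week_01_page_024.png")
#   print(predict_answer(sample, "Can you explain the slide", max_tokens=100))
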
examples = [["week_01_page_024.png", "Can you explain the slide?"]]
# Define the Gradio interface
iface = gr.Interface(
    fn=gradio_predict,
    inputs=[gr.Image(type="pil", label="Upload or Drag an Image"),
            gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
            gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
    outputs=gr.TextArea(label="Answer"),
    examples=examples,
    title="Sparrow - Tiny 3B | Visual Question Answering",
    description="An interactive chat model that answers questions about images in an academic context. \n You can input images, and the system will analyze them and describe their contents. I've used this capability by feeding it slides from the PowerPoint presentations used in my classes, with the lecture content passed as text. As a result, the model mimics the behavior and responses of my professors: present any PowerPoint slide and it explains it just like my professor would, and it can be further personalized.",
)
# Launch the app
iface.queue().launch(debug=True)