Spaces:
Sleeping
Sleeping
import gradio as gr | |
import torch | |
import torch.nn.functional as F | |
from transformers import Blip2Processor, Blip2ForConditionalGeneration | |
from PIL import Image | |
from peft import LoraConfig, get_peft_model | |
# Pick the inference device first so the checkpoint can be mapped onto it at
# load time.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Processor (image preprocessing + tokenizer) from the base BLIP-2 checkpoint.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")

# The loaded object is used directly as a module (.to / .eval / .generate), so
# the file is expected to hold a whole pickled model rather than a state_dict.
# map_location remaps the stored tensors onto `device`, which lets a checkpoint
# saved from a GPU process load on a CPU-only host.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load
# checkpoints from a trusted source. Recent PyTorch releases default to
# weights_only=True, which rejects whole-model pickles; confirm the installed
# version and pass weights_only=False explicitly if loading fails.
# NOTE: an earlier checkpoint, "./full-blip2-deit.pth", was recorded as failing
# to load.
model = torch.load(
    "./full-blip2-deit-config-free-form-4-ver-2.pth",
    map_location=device,
)
model.to(device)
model.eval()  # Set the model to evaluation mode
@torch.no_grad()
def preprocess_image(image):
    """Run the image through the model's vision encoder and repack the result.

    The image is converted to pixel values by the module-level ``processor``,
    embedded with the vision model's patch projection, prefixed with the class
    and distillation tokens, and passed through the vision encoder. The
    encoder's last hidden state is then average-pooled and reshaped into an
    image-shaped tensor.

    Runs under ``torch.no_grad()``: this is inference-only code, so no autograd
    graph is built and intermediate activations are not retained.

    Args:
        image: The input image, in any form accepted by ``processor(images=...)``
            (presumably a PIL image or an array as delivered by Gradio --
            TODO confirm).

    Returns:
        A tensor of shape ``(1, 3, 224, 224)`` on ``device``.
    """
    # Convert the raw image to a pixel-value tensor on the inference device.
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

    embeddings = model.vision_model.embeddings

    # Project the image into patch embeddings and reinterpret them as a
    # (1, num_patches, 1408) sequence. The batch size of 1 and the width of
    # 1408 are hard-coded (1408 is presumably the encoder hidden size --
    # TODO confirm).
    # NOTE(review): .view() reinterprets the projection output in memory order
    # without transposing; kept as-is because the checkpoint was presumably
    # trained with exactly this layout.
    patch_embeddings = embeddings.patch_embeddings.projection(pixel_values)
    patch_embeddings_flat = patch_embeddings.view(1, -1, 1408)

    # Prepend the class token and the distillation token to the patch sequence.
    cls_token = embeddings.cls_token.expand(1, -1, -1)
    dist_token = embeddings.distillation_token.expand(1, -1, -1)
    full_embeddings = torch.cat([cls_token, dist_token, patch_embeddings_flat], dim=1)

    encoder_outputs = model.vision_model.encoder(full_embeddings)
    image_outputs = encoder_outputs.last_hidden_state

    # Pool the last two dimensions down to (3, 50176); 50176 == 224 * 224, so
    # the result can be viewed as a single 3-channel 224x224 "image".
    image_outputs = F.adaptive_avg_pool2d(image_outputs, (3, 50176))
    image_outputs = image_outputs.view(1, 3, 224, 224)
    return image_outputs
def generate_answer_blip2(image, question):
    """Answer a free-form question about an image with the loaded model.

    Args:
        image: The image to ask about, as delivered by the Gradio image input.
            ``None`` (nothing uploaded) is rejected with a user-visible error.
        question: The question text; it is wrapped as
            ``"Question: <question> Answer:"`` before tokenisation.

    Returns:
        The first decoded sequence produced by ``model.generate``, with special
        tokens stripped.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Fail with a clear message instead of an opaque error from the
        # image processor.
        raise gr.Error("Please upload an image before asking a question.")

    image_outputs = preprocess_image(image)

    # Build the text prompt and move the tokenised tensors to the model's
    # device; leaving them on the CPU causes a device mismatch when the model
    # runs on CUDA.
    question_formatted = "Question: " + question + " Answer:"
    inputs = processor(text=question_formatted, return_tensors="pt").to(device)
    inputs["pixel_values"] = image_outputs.to(device)

    # Generate and decode; max_length=50 caps the generated sequence length.
    generated_ids = model.generate(**inputs, max_length=50)
    generated_answer = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_answer[0]  # Only one prompt is submitted, so take the first result
# Gradio UI: an uploaded image and a free-text question go in, and the text
# returned by generate_answer_blip2 comes out.
image_input = gr.Image(label="Upload Image")
question_input = gr.Textbox(label="Enter your question")
answer_output = gr.Textbox(label="Generated Answer")

iface = gr.Interface(
    fn=generate_answer_blip2,
    inputs=[image_input, question_input],
    outputs=answer_output,
    title="Visual Question Answering with DeiT-BLIP2 Model",
    description="Upload an image and type a related question to receive an answer generated by the model.",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()