Spaces:
Sleeping
Sleeping
File size: 4,234 Bytes
2461df4 9050382 d8ca139 9050382 041b756 e6174de 93eb2df 9050382 87729ac bf77b49 87729ac 342f42e b85d146 bf77b49 342f42e 87729ac 342f42e 87729ac a456fab 87729ac a456fab 87729ac a456fab bf77b49 2461df4 b85d146 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import gradio as gr
import torch
import torch.nn.functional as F
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
from peft import LoraConfig, get_peft_model
# Initialize the processor and model.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")

# Resolve the target device *before* loading the checkpoint so its tensors can
# be remapped onto it. Without map_location, a checkpoint saved from a GPU
# session fails to deserialize on a CPU-only host.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoints tried previously (kept for reference):
#   full-blip2-deit-config-yes-no-2.pth
#   ./full-blip2-deit-config-2.pth
#   ./full-blip2-deit.pth  (not working - error)
#   ./full-blip2-deit-config-free-form-4-ver-2.pth
#
# NOTE(security): torch.load unpickles arbitrary Python objects. Only load
# checkpoint files that come from a trusted source.
# NOTE(review): newer PyTorch releases appear to default to weights_only=True,
# which rejects fully pickled modules - confirm against the pinned torch version.
model = torch.load("./full_config_blip2-deit-05", map_location=device)
model.to(device)
model.eval()  # Set the model to evaluation mode
def preprocess_image(image):
    """Encode *image* with the model's vision tower and reshape the result.

    The processor turns the image into pixel values, which are projected into
    patch embeddings, prefixed with the class and distillation tokens, run
    through the vision encoder, and finally pooled and reshaped so the result
    can be handed to the model as ``pixel_values``.

    Args:
        image: The image to encode, in any form accepted by
            ``processor(images=...)``.

    Returns:
        A tensor of shape ``(1, 3, 224, 224)``.
    """
    # Inference only: disable autograd so no computation graph is built or
    # kept alive for the encoder activations.
    with torch.no_grad():
        # Convert the image to a pixel-value tensor on the model's device.
        pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

        # Apply the specific model's embedding pipeline.
        embeddings = model.vision_model.embeddings
        patch_embeddings = embeddings.patch_embeddings.projection(pixel_values)
        # NOTE(review): view() reinterprets the projection output as
        # (1, n, 1408) without moving the channel axis first; presumably this
        # matches how the checkpoint was trained - confirm before changing.
        patch_embeddings_flat = patch_embeddings.view(1, -1, 1408)
        cls_token = embeddings.cls_token.expand(1, -1, -1)
        dist_token = embeddings.distillation_token.expand(1, -1, -1)
        full_embeddings = torch.cat([cls_token, dist_token, patch_embeddings_flat], dim=1)

        encoder_outputs = model.vision_model.encoder(full_embeddings)
        image_outputs = encoder_outputs.last_hidden_state

        # Pool to 3 x 50176 values (50176 == 224 * 224) so the result can be
        # viewed as a single 3-channel 224x224 "image".
        image_outputs = F.adaptive_avg_pool2d(image_outputs, (3, 50176))
        image_outputs = image_outputs.view(1, 3, 224, 224)  # Adjusted dimensions
    return image_outputs
def generate_answer_blip2(image, question):
    """Generate an answer to *question* about *image* using the BLIP2 model.

    Args:
        image: The image to ask about, or ``None`` when nothing was uploaded.
        question: The question text; it is wrapped as
            ``"Question: <question> Answer:"`` before tokenization.

    Returns:
        The first decoded answer string, or a short prompt asking for an
        image when *image* is ``None``.
    """
    # The UI can submit without an image (e.g. right after "Clear"); answer
    # with a hint instead of failing inside the processor.
    if image is None:
        return "Please upload an image before asking a question."

    image_outputs = preprocess_image(image)

    # Prepare question
    question_formatted = "Question: " + question + " Answer:"
    # Move the tokenized text to the model's device as well; otherwise
    # generate() sees CPU token ids next to GPU pixel values when CUDA is used.
    inputs = processor(text=question_formatted, return_tensors="pt").to(device)
    inputs["pixel_values"] = image_outputs.to(device)  # Ensure image tensor is on the correct device

    # Generate response using the model
    generated_ids = model.generate(**inputs, max_length=50)
    generated_answer = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_answer[0]  # Return the first (and typically only) generated answer
# Callback that reveals the demo interface
def show_demo():
    """Return five ``visible=True`` updates, one per output component."""
    update_count = 5
    return tuple(gr.update(visible=True) for _ in range(update_count))
# Setting up the Gradio interface with Blocks
with gr.Blocks() as landing_page:
    # Static landing text shown before the demo is started.
    gr.Markdown("# Welcome to the Visual Question Answering Demo")
    gr.Markdown("This demo uses the customized BLIP2 model to answer questions about images.")
    gr.Markdown("### How to Use: ")
    gr.Markdown("1. Upload an image. \n2. Enter a question related to the image. \n3. Receive the generated answer.")
    gr.Markdown("### Model Information: ")
    gr.Markdown("The BLIP2 model combines vision and language understanding to generate answers based on the provided image and question.")

    # The demo widgets start hidden; "Start Demo" makes them visible.
    with gr.Column() as demo_column:
        start_demo_button = gr.Button("Start Demo")
        image_input = gr.Image(label="Upload Image", visible=False)
        question_input = gr.Textbox(label="Enter your question", visible=False)
        submit_button = gr.Button("Submit", visible=False)
        clear_button = gr.Button("Clear", visible=False)
        answer_output = gr.Textbox(label="Generated Answer", visible=False)

    start_demo_button.click(
        fn=show_demo,
        inputs=None,
        outputs=[image_input, question_input, submit_button, clear_button, answer_output],
    )

    def generate_and_show_answer(image, question):
        """Forward the submitted image and question to ``generate_answer_blip2``."""
        return generate_answer_blip2(image, question)

    submit_button.click(
        fn=generate_and_show_answer,
        inputs=[image_input, question_input],
        outputs=answer_output,
    )

    # Reset the image, the question and the answer. Each component is listed
    # exactly once, with one returned value per component.
    clear_button.click(
        fn=lambda: (None, "", ""),
        inputs=None,
        outputs=[image_input, question_input, answer_output],
    )

if __name__ == "__main__":
    landing_page.launch()
|