import streamlit as st
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image
import numpy as np
from datetime import datetime
import os
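
# Usage note (sketch, not part of the original Space): with Streamlit and the
# dependencies above installed, the app can be launched locally with
# `streamlit run <this_file>.py` (the exact filename depends on the Space layout).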

# Function to convert a numpy array to a PIL image and save it as a timestamped PNG
def array_to_image_path(image_array):
    img = Image.fromarray(np.uint8(image_array))
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"image_{timestamp}.png"
    img.save(filename)
    full_path = os.path.abspath(filename)
    return full_path

# Model and processor initialization
model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True, torch_dtype="auto").eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True)
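
# Device selection (sketch, not in the original code): move the model to a GPU when one
# is available, otherwise keep it on CPU. Assumes a CUDA-enabled torch build if a GPU is present.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)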

# Streamlit app UI
st.title("Qwen2-VL-2B Demo")

# Upload image
uploaded_file = st.file_uploader("Upload an image...", type=["jpg", "png"])

if uploaded_file is not None:
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # User input
    text_input = st.text_input("Enter your question:")

    if st.button("Generate"):
        # Save the uploaded image to disk so it can be referenced by path in the chat message
        image_path = array_to_image_path(np.array(image))
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image_path},
                    {"type": "text", "text": text_input},
                ],
            }
        ]

        # Prepare inputs for inference
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
        inputs = inputs.to(model.device)

        # Model inference
        generated_ids = model.generate(**inputs, max_new_tokens=128)
        generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)

        # Display the generated output
        st.write("Generated Response:", output_text[0])
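
        # Optional cleanup (sketch, not in the original code): remove the temporary
        # image file written by array_to_image_path once inference has finished.
        if os.path.exists(image_path):
            os.remove(image_path)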