# minivlm / example.py
from vlm_model import VLMConfig, VLM
from transformers import AutoProcessor, AutoTokenizer
from PIL import Image
import torch
# Load the model, tokenizer, and image processor
config = VLMConfig.from_pretrained("YOUR_USERNAME/vlm-qwen-siglip")
model = VLM.from_pretrained("YOUR_USERNAME/vlm-qwen-siglip")
tokenizer = AutoTokenizer.from_pretrained("YOUR_USERNAME/vlm-qwen-siglip")
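# The image processor comes from the SigLIP vision backbone, so images are
# resized and normalized exactly as the vision tower expects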
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
# Load image
image = Image.open("your_image.jpg").convert("RGB")
pixel_values = processor(images=image, return_tensors="pt").pixel_values
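# pixel_values has shape (1, 3, 224, 224) for this SigLIP checkpoint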
# Build the chat prompt; config.image_pad_num <|image_pad|> tokens reserve
# space for the image features
chat = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f"What's in this image?{'<|image_pad|>' * config.image_pad_num}"},
]
input_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
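# Optional: run on GPU when available. This assumes VLM follows the usual
# transformers PreTrainedModel interface (.to()/.eval()); adjust if the
# custom class differs.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device).eval()
input_ids = input_ids.to(device)
pixel_values = pixel_values.to(device)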
# Generate response
with torch.no_grad():
    generated_ids = model.generate(
        input_ids=input_ids,
        pixel_values=pixel_values,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
# Decode only the newly generated tokens (everything after the prompt)
response = tokenizer.decode(generated_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
print(response)
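# For deterministic output, set do_sample=False and drop temperature/top_p
# (this assumes VLM inherits transformers' standard GenerationMixin behavior)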