# mimic-svm / code / base-inference.py
# Uploaded by ahmad4raza via huggingface_hub (commit 097b6c6, verified)
# Single-example inference for a Qwen3-VL LoRA checkpoint fine-tuned on MIMIC-CXR data.
import os
import time
from pathlib import Path
# NOTE: must be set BEFORE `import torch` so CUDA exposes only GPU index 7 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
import torch
from PIL import Image
from unsloth import FastVisionModel
from transformers import AutoProcessor
# Fine-tuned checkpoint to load (saved by the trainer at step 17454).
CHECKPOINT_PATH = "outputs/mimic_qwen3vl_lora_8bit_5/checkpoint-17454"
# Base model whose processor (tokenizer + image preprocessor) is reused below.
BASE_MODEL_NAME = "unsloth/Qwen3-VL-8B-Thinking"
# Prompt text file lives next to this script.
SYSTEM_PROMPT_PATH = Path(__file__).with_name("new_system_prompt.txt")
# Two X-ray images from the same study directory (s50000230) — presumably two
# views of one chest study; TODO confirm against the dataset layout.
IMAGE_PATH_1 = Path(
"/home/dgxuser16/NTL/mccarthy/ahmad/cap/dataset/images_1/s50000230/7e962a95-d661c0db-4769286c-e150a106-fb9586c6.jpg"
)
IMAGE_PATH_2 = Path(
"/home/dgxuser16/NTL/mccarthy/ahmad/cap/dataset/images_1/s50000230/f605b192-2e612578-c5c95dc3-b9d6d13b-e0eee500.jpg"
)
SYSTEM_PROMPT = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip()
# Force 3-channel RGB: vision processors expect RGB input; X-rays are typically grayscale.
INPUT_IMAGE_1 = Image.open(IMAGE_PATH_1).convert("RGB")
INPUT_IMAGE_2 = Image.open(IMAGE_PATH_2).convert("RGB")
# Load the fine-tuned checkpoint in 8-bit and put it in inference mode.
# The tokenizer returned alongside the model is unused; the base model's
# processor is loaded separately below.
model, _unused_tokenizer = FastVisionModel.from_pretrained(
    model_name=CHECKPOINT_PATH,
    load_in_8bit=True,
    load_in_4bit=False,
)
FastVisionModel.for_inference(model)  # switch unsloth model to inference mode
processor = AutoProcessor.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
# Chat-format request: both study images, then the instruction text.
# NOTE(review): a separate system turn was used previously; the prompt is now
# delivered inside the user turn instead.
_user_content = [
    {"type": "image", "image": INPUT_IMAGE_1},
    {"type": "image", "image": INPUT_IMAGE_2},
    {"type": "text", "text": SYSTEM_PROMPT},
]
messages = [{"role": "user", "content": _user_content}]
# Tokenize the chat request into model inputs (input_ids, attention_mask,
# plus image tensors) and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
device = next(model.parameters()).device
inputs = inputs.to(device)

if torch.cuda.is_available():
    # Drain pending kernels so the timer measures only generation.
    torch.cuda.synchronize()
start_time = time.perf_counter()
with torch.inference_mode():  # no autograd state needed for generation
    outputs = model.generate(**inputs, max_new_tokens=4096)
if torch.cuda.is_available():
    torch.cuda.synchronize()
gen_time_seconds = time.perf_counter() - start_time

# Prompt length in sequence positions — the correct offset at which generated
# tokens begin in `outputs`.
prompt_len = int(inputs["input_ids"].shape[-1])
# Attended (non-padding) prompt tokens, reported as "Input tokens".
if "attention_mask" in inputs:
    input_tokens = int(inputs["attention_mask"][0].sum().item())
else:
    input_tokens = prompt_len
total_tokens = int(outputs.shape[-1])
output_tokens = total_tokens - prompt_len
# BUGFIX: slice at the full prompt sequence length, not the attended-token
# count — whenever padding is present the two differ, and the old slice
# re-decoded the tail of the prompt as if it were generated text.
generated_text = processor.decode(outputs[0][prompt_len:])
print(f"Input tokens: {input_tokens}")
print(f"Output tokens: {output_tokens}")
print(f"Total tokens: {total_tokens}")
print(f"Generation time (s): {gen_time_seconds:.3f}")
print("\n--- Raw Output ---")
print(generated_text)