# mimic-svm / code / base-inference.py
# Uploaded by ahmad4raza via huggingface_hub (commit 097b6c6, verified)
# Single-example inference for a Qwen3-VL LoRA checkpoint fine-tuned on MIMIC-CXR data.
import os
import time
from pathlib import Path
# NOTE: must be set BEFORE `import torch` so CUDA exposes only GPU index 7 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
import torch
from PIL import Image
from unsloth import FastVisionModel
from transformers import AutoProcessor
# Fine-tuned checkpoint to load (saved by the trainer at step 17454).
CHECKPOINT_PATH = "outputs/mimic_qwen3vl_lora_8bit_5/checkpoint-17454"
# Base model whose processor (tokenizer + image preprocessor) is reused below.
BASE_MODEL_NAME = "unsloth/Qwen3-VL-8B-Thinking"
# Prompt text file lives next to this script.
SYSTEM_PROMPT_PATH = Path(__file__).with_name("new_system_prompt.txt")
# Two X-ray images from the same study directory (s50000230) — presumably two
# views of one chest study; TODO confirm against the dataset layout.
IMAGE_PATH_1 = Path(
"/home/dgxuser16/NTL/mccarthy/ahmad/cap/dataset/images_1/s50000230/7e962a95-d661c0db-4769286c-e150a106-fb9586c6.jpg"
)
IMAGE_PATH_2 = Path(
"/home/dgxuser16/NTL/mccarthy/ahmad/cap/dataset/images_1/s50000230/f605b192-2e612578-c5c95dc3-b9d6d13b-e0eee500.jpg"
)
SYSTEM_PROMPT = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip()
# Force 3-channel RGB: vision processors expect RGB input; X-rays are typically grayscale.
INPUT_IMAGE_1 = Image.open(IMAGE_PATH_1).convert("RGB")
INPUT_IMAGE_2 = Image.open(IMAGE_PATH_2).convert("RGB")
# Load the fine-tuned checkpoint in 8-bit and put it in inference mode.
# The tokenizer returned alongside the model is unused; the base model's
# processor is loaded separately below.
model, _unused_tokenizer = FastVisionModel.from_pretrained(
    model_name=CHECKPOINT_PATH,
    load_in_8bit=True,
    load_in_4bit=False,
)
FastVisionModel.for_inference(model)  # switch unsloth model to inference mode
processor = AutoProcessor.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
# Chat-format request: both study images, then the instruction text.
# NOTE(review): a separate system turn was used previously; the prompt is now
# delivered inside the user turn instead.
_user_content = [
    {"type": "image", "image": INPUT_IMAGE_1},
    {"type": "image", "image": INPUT_IMAGE_2},
    {"type": "text", "text": SYSTEM_PROMPT},
]
messages = [{"role": "user", "content": _user_content}]
# Tokenize the chat request into model inputs (input_ids, attention_mask,
# plus image tensors) and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
device = next(model.parameters()).device
inputs = inputs.to(device)

if torch.cuda.is_available():
    # Drain pending kernels so the timer measures only generation.
    torch.cuda.synchronize()
start_time = time.perf_counter()
with torch.inference_mode():  # no autograd state needed for generation
    outputs = model.generate(**inputs, max_new_tokens=4096)
if torch.cuda.is_available():
    torch.cuda.synchronize()
gen_time_seconds = time.perf_counter() - start_time

# Prompt length in sequence positions — the correct offset at which generated
# tokens begin in `outputs`.
prompt_len = int(inputs["input_ids"].shape[-1])
# Attended (non-padding) prompt tokens, reported as "Input tokens".
if "attention_mask" in inputs:
    input_tokens = int(inputs["attention_mask"][0].sum().item())
else:
    input_tokens = prompt_len
total_tokens = int(outputs.shape[-1])
output_tokens = total_tokens - prompt_len
# BUGFIX: slice at the full prompt sequence length, not the attended-token
# count — whenever padding is present the two differ, and the old slice
# re-decoded the tail of the prompt as if it were generated text.
generated_text = processor.decode(outputs[0][prompt_len:])
print(f"Input tokens: {input_tokens}")
print(f"Output tokens: {output_tokens}")
print(f"Total tokens: {total_tokens}")
print(f"Generation time (s): {gen_time_seconds:.3f}")
print("\n--- Raw Output ---")
print(generated_text)