# inference.py — iimran/Qwen2.5-3B-R1-MedicalReasoner inference example
# (provenance: Hugging Face commit d611803, "Create inference.py", 1.57 kB)
from unsloth import FastLanguageModel, is_bfloat16_supported
from vllm import SamplingParams
from huggingface_hub import snapshot_download
# --- Base model -------------------------------------------------------------
# Load the 4-bit quantized base model with Unsloth's vLLM fast-inference
# backend, capping vLLM at half of the available GPU memory.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="iimran/Qwen2.5-3B-R1-MedicalReasoner",
    load_in_4bit=True,
    fast_inference=True,
    gpu_memory_utilization=0.5,
)

# --- LoRA adapter -----------------------------------------------------------
lora_rank = 64
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=lora_rank,  # alpha == rank keeps the effective LoRA scale at 1.0
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

lora_path = snapshot_download("iimran/Qwen2.5-3B-R1-MedicalReasoner-lora-adapter")
print("LoRA adapter downloaded to:", lora_path)

# BUG FIX: model.load_lora() returns a vLLM LoRARequest that must be handed to
# fast_generate(). The original discarded this return value and then passed
# lora_request=None, so generation silently ran on the *base* weights and the
# downloaded adapter was never applied.
lora_request = model.load_lora(lora_path)

# --- Prompt construction ----------------------------------------------------
SYSTEM_PROMPT = (
    "Respond in the following format:\n"
    "<reasoning>\n"
    "...\n"
    "</reasoning>\n"
    "<answer>\n"
    "...\n"
    "</answer>"
)
USER_PROMPT = (
    "In the context of disseminated intravascular coagulation (DIC), "
    "which blood component is expected to show an increase due to the excessive breakdown of fibrin?"
)

# Render the chat turns through the model's prompt template as a plain string;
# add_generation_prompt appends the assistant header so the model continues
# the conversation as the assistant.
text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
    tokenize=False,
    add_generation_prompt=True,
)

# --- Generation -------------------------------------------------------------
# Low temperature for near-deterministic answers; generous token budget so the
# <reasoning> section is not truncated.
sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.95,
    max_tokens=4096,
)
outputs = model.fast_generate(
    text,
    sampling_params=sampling_params,
    lora_request=lora_request,  # apply the fine-tuned adapter (was None)
)
print(outputs[0].outputs[0].text)