|
from unsloth import FastLanguageModel, is_bfloat16_supported |
|
from vllm import SamplingParams |
|
from huggingface_hub import snapshot_download |
|
# Load the 4-bit quantized base model through Unsloth, enabling the vLLM
# fast-inference backend and capping it at half of the available GPU memory.
_LOAD_KWARGS = {
    "model_name": "iimran/Qwen2.5-3B-R1-MedicalReasoner",
    "load_in_4bit": True,
    "fast_inference": True,
    "gpu_memory_utilization": 0.5,
}
model, tokenizer = FastLanguageModel.from_pretrained(**_LOAD_KWARGS)
|
# Wrap the base model in a PEFT/LoRA shell so adapter weights can be attached.
lora_rank = 64  # alpha is set equal to the rank (Unsloth's usual pairing)

# Adapters are injected into every attention projection and every MLP projection.
_ATTN_PROJECTIONS = ["q_proj", "k_proj", "v_proj", "o_proj"]
_MLP_PROJECTIONS = ["gate_proj", "up_proj", "down_proj"]

model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=_ATTN_PROJECTIONS + _MLP_PROJECTIONS,
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",  # Unsloth's memory-saving variant
    random_state=3407,
)
|
# Download the trained LoRA adapter from the Hugging Face Hub.
lora_path = snapshot_download("iimran/Qwen2.5-3B-R1-MedicalReasoner-lora-adapter")
print("LoRA adapter downloaded to:", lora_path)

# load_lora returns a vLLM LoRARequest handle; it must be passed to
# fast_generate below, otherwise the adapter is loaded but never used.
lora_request = model.load_lora(lora_path)

# Instructs the model to emit its chain of thought inside <reasoning> tags,
# followed by the final answer inside <answer> tags.
SYSTEM_PROMPT = (
    "Respond in the following format:\n"
    "<reasoning>\n"
    "...\n"
    "</reasoning>\n"
    "<answer>\n"
    "...\n"
    "</answer>"
)

USER_PROMPT = (
    "In the context of disseminated intravascular coagulation (DIC), "
    "which blood component is expected to show an increase due to the excessive breakdown of fibrin?"
)

# Render the conversation with the model's chat template. tokenize=False keeps
# the result as a plain string (vLLM tokenizes internally), and
# add_generation_prompt appends the assistant header so the model starts answering.
text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
    tokenize=False,
    add_generation_prompt=True,
)

# Low temperature for near-deterministic answers; max_tokens leaves room for
# the full <reasoning> block plus the answer.
sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.95,
    max_tokens=4096,
)

# BUG FIX: the original passed lora_request=None and discarded the value
# returned by model.load_lora(), so generation ran on the base model without
# the downloaded adapter. Pass the loaded adapter request instead.
outputs = model.fast_generate(
    text,
    sampling_params=sampling_params,
    lora_request=lora_request,
)

# fast_generate returns a list of vLLM RequestOutput objects; print the first
# completion of the single prompt.
print(outputs[0].outputs[0].text)
|
|