```python
from unsloth import FastLanguageModel
from vllm import SamplingParams
from huggingface_hub import snapshot_download

# Load the model in 4-bit with Unsloth's vLLM-backed fast inference path enabled.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="iimran/Qwen2.5-3B-R1-MedicalReasoner",
    load_in_4bit=True,
    fast_inference=True,          # enable vLLM so fast_generate can be used below
    gpu_memory_utilization=0.5,   # cap vLLM's share of GPU memory at 50%
)

# Wrap the model with LoRA adapters on the attention and MLP projections.
lora_rank = 64
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# Download the trained adapter and keep the request object that load_lora
# returns, so it can be passed to fast_generate below.
lora_path = snapshot_download("iimran/Qwen2.5-3B-R1-MedicalReasoner-lora-adapter")
print("LoRA adapter downloaded to:", lora_path)
lora_request = model.load_lora(lora_path)

SYSTEM_PROMPT = (
    "Respond in the following format:\n"
    "<reasoning>\n"
    "...\n"
    "</reasoning>\n"
    "<answer>\n"
    "...\n"
    "</answer>"
)

USER_PROMPT = (
    "In the context of disseminated intravascular coagulation (DIC), "
    "which blood component is expected to show an increase due to the excessive breakdown of fibrin?"
)

text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
    tokenize=False,
    add_generation_prompt=True,
)

sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.95,
    max_tokens=4096,
)

# Generate with the downloaded adapter applied; passing lora_request=None
# here would run the base weights without the fine-tuned adapter.
outputs = model.fast_generate(
    text,
    sampling_params=sampling_params,
    lora_request=lora_request,
)
print(outputs[0].outputs[0].text)
```
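
Since `SYSTEM_PROMPT` asks the model to wrap its final answer in `<answer>` tags, the completion can be post-processed to pull out just that part. A minimal sketch, assuming the model follows the requested format (the `extract_answer` helper is hypothetical, not part of the model's API):

```python
import re

def extract_answer(completion: str):
    # Grab the text between the <answer> tags requested in SYSTEM_PROMPT;
    # returns None if the model did not follow the format.
    match = re.search(r"<answer>\s*(.*?)\s*</answer>", completion, re.DOTALL)
    return match.group(1) if match else None

print("Final answer:", extract_answer(outputs[0].outputs[0].text))
```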