# inference.py — iimran/Qwen2.5-3B-R1-MedicalReasoner inference example
# (provenance: Hugging Face commit d611803, "Create inference.py", 1.57 kB)
from unsloth import FastLanguageModel, is_bfloat16_supported
from vllm import SamplingParams
from huggingface_hub import snapshot_download
# --- Base model -------------------------------------------------------------
# Load the 4-bit quantized base model with Unsloth's vLLM fast-inference
# backend, capping vLLM at half of the available GPU memory.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="iimran/Qwen2.5-3B-R1-MedicalReasoner",
    load_in_4bit=True,
    fast_inference=True,
    gpu_memory_utilization=0.5,
)

# --- LoRA adapter -----------------------------------------------------------
lora_rank = 64
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=lora_rank,  # alpha == rank keeps the effective LoRA scale at 1.0
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

lora_path = snapshot_download("iimran/Qwen2.5-3B-R1-MedicalReasoner-lora-adapter")
print("LoRA adapter downloaded to:", lora_path)

# BUG FIX: model.load_lora() returns a vLLM LoRARequest that must be handed to
# fast_generate(). The original discarded this return value and then passed
# lora_request=None, so generation silently ran on the *base* weights and the
# downloaded adapter was never applied.
lora_request = model.load_lora(lora_path)

# --- Prompt construction ----------------------------------------------------
SYSTEM_PROMPT = (
    "Respond in the following format:\n"
    "<reasoning>\n"
    "...\n"
    "</reasoning>\n"
    "<answer>\n"
    "...\n"
    "</answer>"
)
USER_PROMPT = (
    "In the context of disseminated intravascular coagulation (DIC), "
    "which blood component is expected to show an increase due to the excessive breakdown of fibrin?"
)

# Render the chat turns through the model's prompt template as a plain string;
# add_generation_prompt appends the assistant header so the model continues
# the conversation as the assistant.
text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
    tokenize=False,
    add_generation_prompt=True,
)

# --- Generation -------------------------------------------------------------
# Low temperature for near-deterministic answers; generous token budget so the
# <reasoning> section is not truncated.
sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.95,
    max_tokens=4096,
)
outputs = model.fast_generate(
    text,
    sampling_params=sampling_params,
    lora_request=lora_request,  # apply the fine-tuned adapter (was None)
)
print(outputs[0].outputs[0].text)