from __future__ import annotations

import json
from typing import Any, Dict, List, Optional, Union

import torch
from unsloth import FastLanguageModel
from vllm import SamplingParams

prompt_template = """Please answer the given financial question based on the context.
**Context:** {context}
**Question:** {question}"""
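
# Note: the handler below does not apply this template itself; the assumption is that the
# caller formats it into the user message before sending the request, e.g. (illustrative):
#   prompt_template.format(context="Net revenue was $10M in 2020.", question="What was net revenue?")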


class EndpointHandler:
    """
    Custom handler for HF Inference Endpoints.
    Loads a PEFT LoRA adapter on a 4-bit base model and performs text generation.
    """

    def __init__(self, path: str):
        """
        `path` points to the repo directory mounted by the service.
        The tokenizer and model are loaded from `path` (this repo) with Unsloth's
        FastLanguageModel, which reads adapter_config.json to find the base model
        when `path` contains a PEFT adapter.
        """
        # Default vLLM sampling parameters; per-request "parameters" can override these.
        self.sampling_params = SamplingParams(
            temperature=0.7,
            top_p=0.95,
            top_k=20,
            max_tokens=7 * 1024,
        )

        # Load the 4-bit quantized model and tokenizer with Unsloth's vLLM-backed fast inference.
        model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=path,
            max_seq_length=8192,
            load_in_4bit=True,
            fast_inference=True,
            max_lora_rank=128,
            gpu_memory_utilization=0.5,
            full_finetuning=False,
        )

        # Attach the LoRA (PEFT) configuration (rank 128, rsLoRA) to the base model.
        self.model = FastLanguageModel.get_peft_model(
            model,
            r=128,
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj",
            ],
            lora_alpha=128 * 2,
            use_gradient_checkpointing="unsloth",
            random_state=3407,
            use_rslora=True,
            loftq_config=None,
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        """
        Request format:
        {
            "inputs": "optional raw prompt string",
            "messages": [{"role": "system/user/assistant", "content": "..."}],  # optional
            "parameters": { ... generation overrides ... }
        }

        Returns:
            [ { "generated_text": "<model reply>" } ]
        """
        # Prefer chat "messages"; otherwise wrap the raw "inputs" prompt as a single user turn.
        messages = data.get("messages") or [{"role": "user", "content": data["inputs"]}]

        # Optional per-request overrides of the default sampling parameters.
        params = data.get("parameters") or {}
        sampling_params = SamplingParams(**params) if params else self.sampling_params

        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=True,
        )

        output = (
            self.model.fast_generate(
                [text],
                sampling_params=sampling_params,
                lora_request=None,
                use_tqdm=False,
            )[0]
            .outputs[0]
            .text
        )

        return [{"generated_text": output}]
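

# Minimal local smoke test (a sketch, not part of the endpoint contract): assumes a CUDA GPU
# and that this file sits in the adapter repo directory; the context/question values are
# purely illustrative.
if __name__ == "__main__":
    handler = EndpointHandler(path=".")
    payload = {
        "messages": [
            {
                "role": "user",
                "content": prompt_template.format(
                    context="Net revenue was $10M in 2020 and $12M in 2021.",
                    question="By what percentage did net revenue grow from 2020 to 2021?",
                ),
            }
        ],
        "parameters": {"temperature": 0.2, "max_tokens": 512},
    }
    print(handler(payload)[0]["generated_text"])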