from __future__ import annotations

import json
import os

from llama_cpp import Llama

from ..index_func import *
from ..presets import *
from ..utils import *
from .base_model import BaseLLMModel, download

SYS_PREFIX = "<<SYS>>\n"
SYS_POSTFIX = "\n<</SYS>>\n\n"
INST_PREFIX = "<s>[INST] "
INST_POSTFIX = " "
OUTPUT_PREFIX = "[/INST] "
OUTPUT_POSTFIX = "</s>"
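
# For reference, a minimal sketch of the prompt that _get_llama_style_input()
# below assembles from these markers for a system / user / assistant history
# (illustrative only; the exact text depends on the conversation contents):
#
#   <<SYS>>
#   {system_prompt}
#   <</SYS>>
#
#   <s>[INST] {user_message} [/INST] {assistant_reply}</s>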


class LLaMA_Client(BaseLLMModel):
    def __init__(self, model_name, lora_path=None, user_name="") -> None:
        super().__init__(model_name=model_name, user=user_name)

        self.max_generation_token = 1000
        if model_name in MODEL_METADATA:
            # Known model: download the first file listed in its metadata.
            path_to_model = download(
                MODEL_METADATA[model_name]["repo_id"],
                MODEL_METADATA[model_name]["filelist"][0],
            )
        else:
            # Otherwise, look for a local GGUF file under models/<model_name>/.
            dir_to_model = os.path.join("models", model_name)
            path_to_model = None
            for root, dirs, files in os.walk(dir_to_model):
                for file in files:
                    if file.endswith(".gguf"):
                        path_to_model = os.path.join(root, file)
                        break
                if path_to_model is not None:
                    break
            if path_to_model is None:
                raise FileNotFoundError(
                    f"No .gguf model file found under {dir_to_model}"
                )

        self.system_prompt = ""

        if lora_path is not None:
            lora_path = os.path.join("lora", lora_path)
            self.model = Llama(model_path=path_to_model, lora_path=lora_path)
        else:
            self.model = Llama(model_path=path_to_model)

    def _get_llama_style_input(self):
        """Flatten self.history into a single Llama-2-chat style prompt string."""
        context = []
        for conv in self.history:
            if conv["role"] == "system":
                context.append(SYS_PREFIX + conv["content"] + SYS_POSTFIX)
            elif conv["role"] == "user":
                # Each user turn opens an instruction block and leaves the
                # "[/INST] " marker open for the model's reply.
                context.append(
                    INST_PREFIX + conv["content"] + INST_POSTFIX + OUTPUT_PREFIX
                )
            else:
                # Assistant turns close the block with the end-of-sequence marker.
                context.append(conv["content"] + OUTPUT_POSTFIX)
        return "".join(context)

    def get_answer_at_once(self):
        context = self._get_llama_style_input()
        response = self.model(
            context,
            max_tokens=self.max_generation_token,
            stop=[],
            echo=False,
            stream=False,
        )
        # With stream=False, llama_cpp returns a completion dict; pull out the
        # generated text rather than returning the raw dict.
        answer = response["choices"][0]["text"]
        return answer, len(answer)

    def get_answer_stream_iter(self):
        context = self._get_llama_style_input()
        completion_iter = self.model(
            context,
            max_tokens=self.max_generation_token,
            stop=[SYS_PREFIX, SYS_POSTFIX, INST_PREFIX, OUTPUT_PREFIX, OUTPUT_POSTFIX],
            echo=False,
            stream=True,
        )
        partial_text = ""
        for chunk in completion_iter:
            # Each streamed chunk carries an incremental piece of the completion;
            # accumulate and yield the text generated so far.
            partial_text += chunk["choices"][0]["text"]
            yield partial_text
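

# A minimal usage sketch (illustrative assumptions: a GGUF model available under
# models/<model_name>/ or an entry in MODEL_METADATA, and a history shaped like
# the role/content dicts consumed by _get_llama_style_input):
#
#   client = LLaMA_Client("llama-2-7b-chat")
#   client.history = [{"role": "user", "content": "Hello!"}]
#   for partial in client.get_answer_stream_iter():
#       print(partial)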
|