# %%
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

from schema import Message, MODEL_ARGS


def get_llm(model_name: str) -> Llama:
    """Download the GGUF weights for `model_name` and load the model on CPU."""
    llm = Llama(
        model_path=hf_hub_download(**MODEL_ARGS[model_name]),
        n_ctx=8192,  # context window, in tokens
        n_threads=4,
        n_gpu_layers=0,  # 0 = CPU-only inference
        verbose=False,
    )
    return llm
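
# %% assumed shape of `schema` (illustrative sketch; the real module is not shown here)
# `Message` is assumed to be a simple role/content container and MODEL_ARGS a
# mapping from model name to hf_hub_download kwargs. The entry below is a
# hypothetical example, not the project's actual configuration:
#
# from dataclasses import dataclass
#
# @dataclass
# class Message:
#     role: str
#     content: str
#
# MODEL_ARGS = {
#     "mistral-7b-instruct": dict(
#         repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
#         filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
#     ),
# }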


def format_chat(chat_history: list[Message]) -> str:
    """
    Formats the chat history into a single prompt string
    suitable for the model, ending with an "Assistant:" cue.
    """
    messages = []
    for msg in chat_history:
        # e.g. Message(role="user", content="Hi") -> "User: Hi"
        messages.append(f"{msg.role.title()}: {msg.content}")
    return "\n".join(messages) + "\nAssistant:"


def chat_with_model(chat_history: list[Message], model: str, kwargs: dict):
    prompt = format_chat(chat_history)
    default_kwargs = dict(
        max_tokens=2048,
        top_k=1,
    )
    forced_kwargs = dict(
        stop=["\nUser:", "\nAssistant:", "</s>"],  # cut generation at the next turn marker
        echo=False,
        stream=True,
    )
    llm = get_llm(model)
    # Later dicts win: caller kwargs override the defaults,
    # and the forced kwargs override everything else.
    input_kwargs = {**default_kwargs, **kwargs, **forced_kwargs}
    response = llm(prompt, **input_kwargs)
    for token in response:
        yield token["choices"][0]["text"]


# %% example input
# kwargs = dict(
#     temperature=1,
#     max_tokens=2048,
#     top_p=1,
#     frequency_penalty=0,
#     presence_penalty=0,
# )
# chat_history = [
#     Message(
#         role="system",
#         content="You are a helpful and knowledgeable assistant, but you are willing to bend the facts to play along with unrealistic requests.",
#     ),
#     Message(role="user", content="What does Java the programming language taste like?"),
# ]
# # "mistral-7b-instruct" is a hypothetical MODEL_ARGS key; substitute a real one.
# for chunk in chat_with_model(chat_history, "mistral-7b-instruct", kwargs):
#     print(chunk, end="")