# %%
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

from schema import Message, MODEL_ARGS


def get_llm(model_name):
    """Download (or load from cache) the weights for `model_name` and build a CPU-only Llama instance."""
    llm = Llama(
        model_path=hf_hub_download(**MODEL_ARGS[model_name]),
        n_ctx=8192,
        n_threads=4,
        n_gpu_layers=0,
        verbose=False,
    )
    return llm


def format_chat(chat_history: list[Message]):
    """
    Formats the chat history (system, user, and assistant messages) into a single
    prompt string and appends an "Assistant:" cue for the model to continue from.
    """
    messages = []
    for msg in chat_history:
        messages.append(f"{msg.role.title()}: {msg.content}")
    return "\n".join(messages) + "\nAssistant:"


# Defaults applied to every call unless overridden by the caller's kwargs.
default_kwargs = dict(
    max_tokens=2048,
    top_k=1,
)


def stream_with_model(chat_history, model, kwargs: dict):
    """Yield completion text chunks as the model generates them."""
    prompt = format_chat(chat_history)
    llm = get_llm(model)
    # These settings always take precedence over caller-supplied kwargs.
    forced_kwargs = dict(
        stop=["\nUser:", "\nAssistant:"],  # cut generation when the model starts a new turn
        echo=False,
        stream=True,
    )
    input_kwargs = {**default_kwargs, **kwargs, **forced_kwargs}
    response = llm(prompt, **input_kwargs)
    for token in response:
        yield token["choices"][0]["text"]


def chat_with_model(chat_history, model, kwargs: dict):
    """Return the full completion text in one shot (non-streaming)."""
    prompt = format_chat(chat_history)
    llm = get_llm(model)
    # These settings always take precedence over caller-supplied kwargs.
    forced_kwargs = dict(
        stop=["\nUser:", "\nAssistant:"],  # cut generation when the model starts a new turn
        echo=False,
        stream=False,
    )
    input_kwargs = {**default_kwargs, **kwargs, **forced_kwargs}
    response = llm(prompt, **input_kwargs)
    return response["choices"][0]["text"].strip()


# %% example input
# kwargs = dict(
#     temperature=1,
#     max_tokens=2048,
#     top_p=1,
#     frequency_penalty=0,
#     presence_penalty=0,
# )
# chat_history = [
#     Message(
#         role="system",
#         content="You are a helpful and knowledgeable assistant, but are willing to bend the facts to play along with unrealistic requests",
#     ),
#     Message(role="user", content="What does Java the programming language taste like?"),
# ]
# # "model-name" is a placeholder; it must be a key in MODEL_ARGS.
# for chunk in stream_with_model(chat_history, "model-name", kwargs):
#     print(chunk, end="")
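

# %% assumed shape of the `schema` module (reference only, not part of this file)
# `Message` and `MODEL_ARGS` are imported from a local `schema` module that is not
# shown here. The commented sketch below is an assumption inferred from how they are
# used above: Message needs `role` and `content` attributes, and MODEL_ARGS must map
# a model name to keyword arguments accepted by huggingface_hub.hf_hub_download.
#
# from dataclasses import dataclass
#
# @dataclass
# class Message:
#     role: str      # e.g. "system", "user", "assistant"
#     content: str
#
# # Hypothetical entry; the repo_id/filename are illustrative, not taken from this file.
# MODEL_ARGS = {
#     "mistral-7b-instruct": dict(
#         repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
#         filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
#     ),
# }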