# %%
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

from schema import Message, MODEL_ARGS


def get_llm(model_name: str) -> Llama:
    """Download the GGUF weights for `model_name` and load the model on CPU."""
    llm = Llama(
        model_path=hf_hub_download(**MODEL_ARGS[model_name]),
        n_ctx=8192,  # context window, in tokens
        n_threads=4,
        n_gpu_layers=0,  # 0 = CPU-only inference
        verbose=False,
    )
    return llm
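
# %% assumed shape of `schema` (illustrative sketch; the real module is not shown here)
# `Message` is assumed to be a simple role/content container and MODEL_ARGS a
# mapping from model name to hf_hub_download kwargs. The entry below is a
# hypothetical example, not the project's actual configuration:
#
# from dataclasses import dataclass
#
# @dataclass
# class Message:
#     role: str
#     content: str
#
# MODEL_ARGS = {
#     "mistral-7b-instruct": dict(
#         repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
#         filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
#     ),
# }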


def format_chat(chat_history: list[Message]) -> str:
    """
    Formats the chat history into a single prompt string
    suitable for the model, ending with an "Assistant:" cue.
    """
    messages = []
    for msg in chat_history:
        # e.g. Message(role="user", content="Hi") -> "User: Hi"
        messages.append(f"{msg.role.title()}: {msg.content}")
    return "\n".join(messages) + "\nAssistant:"


def chat_with_model(chat_history: list[Message], model: str, kwargs: dict):
    prompt = format_chat(chat_history)
    default_kwargs = dict(
        max_tokens=2048,
        top_k=1,
    )
    forced_kwargs = dict(
        stop=["\nUser:", "\nAssistant:", "</s>"],  # cut generation at the next turn marker
        echo=False,
        stream=True,
    )
    llm = get_llm(model)
    # Later dicts win: caller kwargs override the defaults,
    # and the forced kwargs override everything else.
    input_kwargs = {**default_kwargs, **kwargs, **forced_kwargs}
    response = llm(prompt, **input_kwargs)
    for token in response:
        yield token["choices"][0]["text"]


# %% example input
# kwargs = dict(
#     temperature=1,
#     max_tokens=2048,
#     top_p=1,
#     frequency_penalty=0,
#     presence_penalty=0,
# )
# chat_history = [
#     Message(
#         role="system",
#         content="You are a helpful and knowledgeable assistant, but you are willing to bend the facts to play along with unrealistic requests.",
#     ),
#     Message(role="user", content="What does Java the programming language taste like?"),
# ]
# # "mistral-7b-instruct" is a hypothetical MODEL_ARGS key; substitute a real one.
# for chunk in chat_with_model(chat_history, "mistral-7b-instruct", kwargs):
#     print(chunk, end="")