The Space runs the following Gradio app:
import gradio as gr
from llama_cpp import Llama

# Load the GGUF model via llama-cpp-python, with two CPU threads each
# for generation and for batch (prompt) processing.
llm = Llama(
    model_path="gemma-2b-uk.gguf",
    n_threads=2,
    n_threads_batch=2,
)
def convert_history(message, history):
    """Flatten the Gradio chat history into the model's prompt format."""
    chat_history = ""
    # Keep only the most recent exchange so the prompt stays short.
    for block in history[-1:]:
        chat_history += f"<|user|>\n{block[0]}<eos>\n<|assistant|>\n{block[1]}<eos>\n"
    chat_history += f"<|user|>\n{message}<eos>\n<|assistant|>\n"
    return chat_history
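# For example (an illustration, not output from the Space itself): with
# history = [("Hi", "Hello!")] and message = "How are you?", the function returns
#   "<|user|>\nHi<eos>\n<|assistant|>\nHello!<eos>\n<|user|>\nHow are you?<eos>\n<|assistant|>\n"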
def ask(message, history):
    chat_history = convert_history(message, history)
    # Stream tokens from the model instead of waiting for the full answer.
    chunks = llm(
        chat_history,
        temperature=0.2,
        top_p=0.9,
        stream=True,
        repeat_penalty=1.05,
        max_tokens=128,
    )
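    # Each streamed chunk follows llama-cpp-python's OpenAI-style completion
    # format, roughly (a sketch, fields abridged):
    #   {"choices": [{"text": "...", "index": 0, "finish_reason": None}]}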
    response = ""
    for chunk in chunks:
        delta = chunk["choices"][0]["text"]
        print(delta)  # log each token fragment server-side
        response += delta
        # Yield the accumulated text so Gradio re-renders it incrementally.
        yield response

demo = gr.ChatInterface(ask)

if __name__ == "__main__":
    demo.queue().launch()
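The script expects gemma-2b-uk.gguf to sit next to it on disk. If the weights live in a Hub repository instead, a minimal sketch like this could fetch them at startup (the repo id below is a placeholder, not the Space's actual source):

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Placeholder repo id -- point this at wherever the GGUF file is hosted.
model_path = hf_hub_download(
    repo_id="your-username/gemma-2b-uk-gguf",
    filename="gemma-2b-uk.gguf",
)
llm = Llama(model_path=model_path, n_threads=2, n_threads_batch=2)

Once the model file is in place, running the script with Python starts the app, which Gradio serves on port 7860 by default.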