# gemma-2b-uk / app.py
# Gradio chat demo for a Ukrainian Gemma-2B GGUF model, served with
# llama-cpp-python using the zephyr chat format.
import gradio as gr
from llama_cpp import Llama

# Load the GGUF model with llama-cpp-python (CPU inference, two threads).
llm = Llama(
    model_path="gemma-2b-uk.gguf",
    n_threads=2,
    n_threads_batch=2,
)
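# The settings above keep things CPU-friendly; Llama() also accepts n_ctx
# (context window, default 512) and n_gpu_layers (-1 offloads all layers to
# GPU) if more capacity is needed. Both are standard llama-cpp-python
# parameters; enlarging them here is only a suggestion.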

def convert_history(message, history):
    """Render the Gradio chat history into a zephyr-style prompt."""
    chat_history = ""
    # Keep only the most recent exchange so the prompt fits a small context window.
    for block in history[-1:]:
        chat_history += f"<|user|>\n{block[0]}<eos>\n<|assistant|>\n{block[1]}<eos>\n"
    chat_history += f"<|user|>\n{message}<eos>\n<|assistant|>\n"
    return chat_history
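# Illustrative prompt for a one-exchange history (hypothetical values):
# convert_history("How does it work?", [["Hi", "Hello!"]]) returns
# "<|user|>\nHi<eos>\n<|assistant|>\nHello!<eos>\n<|user|>\nHow does it work?<eos>\n<|assistant|>\n"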

def ask(message, history):
    """Stream the model's reply token by token."""
    chat_history = convert_history(message, history)
    chunks = llm(
        chat_history,
        temperature=0.2,
        top_p=0.9,
        stream=True,
        repeat_penalty=1.05,
        max_tokens=128,
    )
    response = ""
    for chunk in chunks:
        delta = chunk["choices"][0]["text"]
        print(delta)  # echo each streamed token to the console for debugging
        response += delta
        yield response
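# If replies overrun the turn boundary, llm() also accepts a stop list, e.g.
# stop=["<eos>", "<|user|>"]; whether these exact strings act as turn markers
# for this fine-tune is an assumption.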

demo = gr.ChatInterface(ask)

if __name__ == "__main__":
    demo.queue().launch()
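# Note: .queue() turns on Gradio's request queue, which generator callbacks
# like ask() rely on for streaming; recent Gradio versions enable it by default.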