The Space runs the following Gradio app:
import gradio as gr
from llama_cpp import Llama

# Load the GGUF model via llama-cpp-python, with two CPU threads each
# for generation and for batch (prompt) processing.
llm = Llama(
    model_path="gemma-2b-uk.gguf",
    n_threads=2,
    n_threads_batch=2,
)
def convert_history(message, history):
    """Flatten the Gradio chat history into the model's prompt format."""
    chat_history = ""
    # Keep only the most recent exchange so the prompt stays short.
    for block in history[-1:]:
        chat_history += f"<|user|>\n{block[0]}<eos>\n<|assistant|>\n{block[1]}<eos>\n"
    chat_history += f"<|user|>\n{message}<eos>\n<|assistant|>\n"
    return chat_history
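# For example (an illustration, not output from the Space itself): with
# history = [("Hi", "Hello!")] and message = "How are you?", the function returns
#   "<|user|>\nHi<eos>\n<|assistant|>\nHello!<eos>\n<|user|>\nHow are you?<eos>\n<|assistant|>\n"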
def ask(message, history):
    chat_history = convert_history(message, history)
    # Stream tokens from the model instead of waiting for the full answer.
    chunks = llm(
        chat_history,
        temperature=0.2,
        top_p=0.9,
        stream=True,
        repeat_penalty=1.05,
        max_tokens=128,
    )
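    # Each streamed chunk follows llama-cpp-python's OpenAI-style completion
    # format, roughly (a sketch, fields abridged):
    #   {"choices": [{"text": "...", "index": 0, "finish_reason": None}]}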
    response = ""
    for chunk in chunks:
        delta = chunk["choices"][0]["text"]
        print(delta)  # log each token fragment server-side
        response += delta
        # Yield the accumulated text so Gradio re-renders it incrementally.
        yield response

demo = gr.ChatInterface(ask)

if __name__ == "__main__":
    demo.queue().launch()
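The script expects gemma-2b-uk.gguf to sit next to it on disk. If the weights live in a Hub repository instead, a minimal sketch like this could fetch them at startup (the repo id below is a placeholder, not the Space's actual source):

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Placeholder repo id -- point this at wherever the GGUF file is hosted.
model_path = hf_hub_download(
    repo_id="your-username/gemma-2b-uk-gguf",
    filename="gemma-2b-uk.gguf",
)
llm = Llama(model_path=model_path, n_threads=2, n_threads_batch=2)

Once the model file is in place, running the script with Python starts the app, which Gradio serves on port 7860 by default.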