import gradio as gr
from llama_cpp import Llama

# Load the GGUF model; n_threads controls generation threads,
# n_threads_batch controls prompt-processing threads.
llm = Llama(
    model_path="gemma-2b-uk.gguf",
    n_threads=2,
    n_threads_batch=2,
)


def convert_history(message, history):
    """Flatten the Gradio pair-format history into the model's prompt template.

    Only the most recent exchange is kept (history[-1:]) to stay within
    the model's context window.
    """
    chat_history = ""
    for user_msg, assistant_msg in history[-1:]:
        chat_history += f"<|user|>\n{user_msg}\n<|assistant|>\n{assistant_msg}\n"
    chat_history += f"<|user|>\n{message}\n<|assistant|>\n"
    return chat_history


def ask(message, history):
    chat_history = convert_history(message, history)
    # Stream completion tokens so the UI can render the reply incrementally.
    chunks = llm(
        chat_history,
        temperature=0.2,
        top_p=0.9,
        stream=True,
        repeat_penalty=1.05,
        max_tokens=128,
    )
    response = ""
    for chunk in chunks:
        delta = chunk["choices"][0]["text"]
        print(delta)  # echo each streamed token to the console for debugging
        response += delta
        yield response  # Gradio re-renders the growing reply on each yield


demo = gr.ChatInterface(ask)

if __name__ == "__main__":
    demo.queue().launch()
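
# For illustration, a sketch of the prompt convert_history() produces for a
# one-turn history (the message strings are hypothetical):
#
#   convert_history("How are you?", [["Hello!", "Hi there!"]])
#   # -> "<|user|>\nHello!\n<|assistant|>\nHi there!\n<|user|>\nHow are you?\n<|assistant|>\n"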