from llama_cpp import Llama
import gradio as gr
import time

# Load the quantized Zephyr-7B model from a local GGUF file.
llm = Llama(model_path="zephyr-7B-beta-GGUF/zephyr-7b-beta.Q4_K_M.gguf")

def predict(prompt, history):
    # Generate a completion for the user's prompt.
    output = llm(prompt)
    response = output['choices'][0]['text']
    # Yield progressively longer prefixes of the reply so the
    # Gradio chat window renders it as if it were being typed.
    for i in range(len(response)):
        time.sleep(0.05)
        yield response[:i + 1]

gr.ChatInterface(predict).queue().launch()