from llama_cpp import Llama
import gradio as gr
import time

# Load the quantized Zephyr 7B model from the local GGUF file
llm = Llama(model_path="zephyr-7B-beta-GGUF/zephyr-7b-beta.Q4_K_M.gguf")

def predict(prompt, history):
    # Run a completion for the user's message
    output = llm(prompt)
    response = output['choices'][0]['text']
    # Reveal the finished response one character at a time so the
    # Gradio chat window appears to stream the reply
    for i in range(len(response)):
        time.sleep(0.05)
        yield response[:i + 1]

gr.ChatInterface(predict).queue().launch()
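The loop above only simulates streaming: it waits for the whole completion and then replays it character by character. llama-cpp-python can also stream tokens as they are generated by passing stream=True, which keeps the chat responsive on longer replies. The sketch below is a minimal variant under that assumption; it reuses the same model path and ChatInterface wiring, and the max_tokens value is an illustrative choice rather than something from the original snippet.

from llama_cpp import Llama
import gradio as gr

llm = Llama(model_path="zephyr-7B-beta-GGUF/zephyr-7b-beta.Q4_K_M.gguf")

def predict(prompt, history):
    # With stream=True, llm() returns a generator of partial completions,
    # so each token can be forwarded to the UI as soon as llama.cpp emits it.
    partial = ""
    for chunk in llm(prompt, max_tokens=256, stream=True):
        partial += chunk['choices'][0]['text']
        yield partial

gr.ChatInterface(predict).queue().launch()

Because predict is still a generator that yields ever-longer strings, the ChatInterface call stays exactly the same; only the source of the partial text changes.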