import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import psutil

## Download the GGUF model
model_name = "jackangel/LLama_3_Instruct_SPFx_Docs_Unsloth"
# This is the specific model file we'll use in this example. It's a 4-bit quant,
# but other levels of quantization are available in the model repo if preferred.
model_file = "Llama_3_Instruct_SPFx_Docs_Unsloth.Q4_K_M.gguf"
model_path = hf_hub_download(model_name, filename=model_file)

# Use all physical cores but one; psutil.cpu_count() can return None, so fall back to 1
physical_cores = psutil.cpu_count(logical=False)
cpu_count: int = max(physical_cores - 1, 1) if physical_cores else 1

## Instantiate model from downloaded file
llm = Llama(
    model_path=model_path,
    n_ctx=1024,           # Context length to use
    n_threads=cpu_count,  # Number of CPU threads to use
    n_gpu_layers=0        # Number of model layers to offload to GPU
)

## Generation kwargs
generation_kwargs = {
    "max_tokens": 1000,
    "stop": ["</s>"],  # Stop generating when the end-of-sequence marker is produced
    "echo": False,     # Echo the prompt in the output
    # top_k=1 is essentially greedy decoding, since the model will always return the
    # highest-probability token. Set this value > 1 for sampling decoding.
    "top_k": 1
}

def chatbot(message, history):
    # history is a list of (user, assistant) pairs from previous turns
    prompt = "INSTRUCTION: You are a helpful assistant\nINPUT: " + message + "\nOUTPUT:"
    airemember = ""
    for human, assistant in history:
        airemember += "USER: " + human + "\nASSISTANT:" + assistant + "\n\n"
    sendtoai = airemember + prompt
    result = llm(sendtoai, **generation_kwargs)
    # llm() returns a completion dict; extract the generated text for the chat UI
    return result["choices"][0]["text"]

app = gr.ChatInterface(chatbot)
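
## Launch the app
# A minimal final step, assuming the script is run directly: launch() starts a local
# Gradio server and prints the URL where the chat UI is served.
if __name__ == "__main__":
    app.launch()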