# Gradio chat demo for a GGUF Llama model served with llama-cpp-python on a CPU Space.

import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Model repository and GGUF filename on the Hugging Face Hub.
MODEL_REPO = "Kezovic/iris-f16gguf-test"
MODEL_FILE = "llama-3.2-1b-instruct.F16.gguf"
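# Note: the F16 file of a 1B-parameter model is roughly 2.5 GB. If the repo also
# publishes a quantized GGUF (e.g. a Q4_K_M file; filename hypothetical), pointing
# MODEL_FILE at it would cut download time and memory use on a CPU Space.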

# Generation settings. A TEMPERATURE of 1.5 is on the high side and gives fairly
# random output; lower values (e.g. 0.7) produce more focused replies.
CONTEXT_WINDOW = 4096   # tokens of context (prompt + completion) the model can attend to
MAX_NEW_TOKENS = 512    # cap on tokens generated per reply
TEMPERATURE = 1.5


def load_llm():
    """Downloads the GGUF model and initializes LlamaCPP."""
    print("Downloading model...")
    # hf_hub_download caches the file locally, so repeated startups skip the download.
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE
    )

    llm = Llama(
        model_path=model_path,
        n_ctx=CONTEXT_WINDOW,
        n_threads=2,      # the free CPU Basic Space tier provides two vCPUs
        verbose=False
    )
    print("Model loaded successfully!")
    return llm


# Load the model once at startup, before the UI launches.
llm = load_llm()


def generate(prompt, history):
    """Generates a response using the Llama model."""
    # history is accepted for gr.ChatInterface compatibility but ignored here,
    # so each turn is answered without conversation context.
    full_prompt = f"### Human: {prompt}\n### Assistant:"

    output = llm(
        prompt=full_prompt,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        min_p=0.1,            # min-p sampling is a per-call option, not a Llama() constructor argument
        stop=["### Human:"],
        echo=False
    )

    response_text = output['choices'][0]['text'].strip()
    return response_text
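

# The "### Human:" prompt format above is a generic instruct style; Llama 3.2
# Instruct models are trained on the Llama 3 chat template. A minimal alternative
# sketch, assuming a recent llama-cpp-python that reads the chat template from the
# GGUF metadata (otherwise pass chat_format="llama-3" to Llama()) and the default
# pair-style history that gr.ChatInterface provides:
def generate_chat(prompt, history):
    """Rebuilds the conversation as chat messages and applies the model's template."""
    messages = []
    for user_turn, assistant_turn in history:
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": prompt})

    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE
    )
    return output["choices"][0]["message"]["content"].strip()

# To try it, pass generate_chat instead of generate to gr.ChatInterface below.
# create_chat_completion also accepts stream=True; a generator that yields the
# accumulated text enables token-by-token streaming in the chat UI.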


gr.ChatInterface(
    generate,
    title=f"Chat with {MODEL_FILE}",
    description="A GGUF LLM hosted on a Hugging Face CPU Space using llama-cpp-python."
).launch()
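
# Local usage (assuming this file is saved as app.py):
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py
# On a Hugging Face Space, list the same packages in requirements.txt; the Space
# runs app.py automatically.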