from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
REPO_ID = "google/gemma-2b-it-GGUF"
FILENAME = "gemma-2b-it.gguf"
HF_TOKEN = os.environ.get("HF_TOKEN")
MODEL_DIR = "./models"
CACHE_DIR = "./models/.hf_cache"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
# Make sure directories exist
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(CACHE_DIR, exist_ok=True)
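# Step 1: Download the GGUF file if it is not already present (later restarts reuse the local copy)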
if not os.path.exists(MODEL_PATH):
    try:
        print("📦 Downloading model from Hugging Face Hub...")
        hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            token=HF_TOKEN,
            cache_dir=CACHE_DIR,
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False,  # deprecated (ignored by recent huggingface_hub), kept for older versions
        )
        print(f"✅ Model downloaded to {MODEL_PATH}")
    except Exception as e:
        print(f"❌ Download failed: {e}")
        raise
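# Note: hf_hub_download also returns the resolved local path; with local_dir set
# as above it should match MODEL_PATH, keeping the existence check consistent.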
# Step 2: Load model using llama-cpp-python
print("🤖 Loading GGUF model...")
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=512,
    n_threads=4,
    n_batch=512,
    verbose=False,
)
# Step 3: FastAPI app
app = FastAPI()
class PromptRequest(BaseModel):
    prompt: str
@app.post("/prompt")
def generate_prompt(req: PromptRequest):
    prompt = req.prompt.strip()
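    # Optional refinement (assumption, not in the original): gemma-2b-it is
    # instruction-tuned, so wrapping the prompt in Gemma's chat-turn format
    # usually improves replies:
    # prompt = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"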
    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.6,
        top_p=0.95,
        stop=["<|endoftext|>", "</s>", "```"],
        echo=False,
    )
    result = output["choices"][0]["text"].strip()
    return {"response": result}
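# Local run sketch (assumes this file is named app.py and uvicorn is installed;
# port 7860 is the Hugging Face Spaces convention, adjust as needed):
#
#   uvicorn app:app --host 0.0.0.0 --port 7860
#
# Example request:
#
#   curl -X POST http://localhost:7860/prompt \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain GGUF in one sentence."}'
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)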