Serve Foundation-Sec-8B-Q4_K_M directly from upstream repo
import os, logging, requests
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama
# Direct public download link
MODEL_URL = (
    "https://huggingface.co/fdtn-ai/Foundation-Sec-8B-Q4_K_M-GGUF/"
    "resolve/main/foundation-sec-8b-q4_k_m.gguf"
)
MODEL_PATH = "foundation-sec-8b-q4_k_m.gguf"
@asynccontextmanager
async def lifespan(app: FastAPI):
    logging.basicConfig(level=logging.INFO)

    # Download once; skip if already present
    if not os.path.exists(MODEL_PATH):
        logging.info("Downloading model … (~4.9 GB)")
        with requests.get(MODEL_URL, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(MODEL_PATH, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        logging.info("Download finished.")

    logging.info("Loading model …")
    app.state.llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=4096,
        n_threads=os.cpu_count(),
        verbose=False,
    )
    logging.info("Model ready.")

    yield

    logging.info("Shutting down.")
app = FastAPI(lifespan=lifespan)
class ChatRequest(BaseModel):
    messages: list[dict]
    max_tokens: int = 256
    temperature: float = 0.7
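
# Each entry in ChatRequest.messages is an OpenAI-style chat dict, as expected
# by llama_cpp's create_chat_completion, for example:
#   {"role": "user", "content": "Summarise CVE-2021-44228."}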
@app.get("/")
def root():
    return {"message": "Foundation-Sec-8B API running on HF Space"}
@app.post("/v1/chat/completions")
def chat(req: ChatRequest):
    try:
        return app.state.llm.create_chat_completion(
            messages=req.messages,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
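
For reference, a minimal client sketch for calling this endpoint from outside the Space. The base URL, launch command, and prompt below are placeholders and assumptions, not something this file defines.

# Minimal client sketch (run outside the Space).
# Assumes the app is served at the placeholder URL below, e.g. via
#   uvicorn app:app --host 0.0.0.0 --port 7860
import requests

BASE_URL = "https://your-space.hf.space"  # placeholder; substitute your Space URL

resp = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "What is CVE-2021-44228?"}],
        "max_tokens": 128,
        "temperature": 0.2,
    },
    timeout=300,  # generous timeout: CPU inference on an 8B model is slow
)
resp.raise_for_status()
# create_chat_completion returns an OpenAI-style completion payload
print(resp.json()["choices"][0]["message"]["content"])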