import logging
import os
from contextlib import asynccontextmanager

import requests
from fastapi import FastAPI, HTTPException
from llama_cpp import Llama
from pydantic import BaseModel

# Direct public download link
MODEL_URL = (
"https://huggingface.co/fdtn-ai/Foundation-Sec-8B-Q4_K_M-GGUF/"
"resolve/main/foundation-sec-8b-q4_k_m.gguf"
)
MODEL_PATH = "foundation-sec-8b-q4_k_m.gguf"
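
# Lifespan handler: on first startup, download the GGUF weights if they are not
# already on disk, then load them with llama.cpp before the app serves requests.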
@asynccontextmanager
async def lifespan(app: FastAPI):
    logging.basicConfig(level=logging.INFO)
    # Download once; skip if already present
    if not os.path.exists(MODEL_PATH):
        logging.info("Downloading model … (~4.9 GB)")
        with requests.get(MODEL_URL, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(MODEL_PATH, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        logging.info("Download finished.")
    logging.info("Loading model …")
    app.state.llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=4096,
        n_threads=os.cpu_count(),
        verbose=False
    )
    logging.info("Model ready.")
    yield
    logging.info("Shutting down.")

app = FastAPI(lifespan=lifespan)

class ChatRequest(BaseModel):
    messages: list[dict]
    max_tokens: int = 256
    temperature: float = 0.7

@app.get("/")
def root():
    return {"message": "Foundation-Sec-8B API running on HF Space"}
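
# OpenAI-style chat completions endpoint backed by llama-cpp-python's
# create_chat_completion, which returns an OpenAI-compatible response dict.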
@app.post("/v1/chat/completions")
def chat(req: ChatRequest):
    try:
        return app.state.llm.create_chat_completion(
            messages=req.messages,
            max_tokens=req.max_tokens,
            temperature=req.temperature
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
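
# Optional local entry point and example call (a sketch, not part of the original
# file): Hugging Face Spaces conventionally serve on port 7860; adjust host/port
# to match your deployment.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)

# Example request once the Space is up (replace <your-space> with the real URL;
# the JSON body mirrors the ChatRequest model above):
#   curl -X POST https://<your-space>.hf.space/v1/chat/completions \
#        -H "Content-Type: application/json" \
#        -d '{"messages": [{"role": "user", "content": "Summarize the impact of Log4Shell."}]}'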