import os
import logging
import requests
from contextlib import asynccontextmanager

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama

# Direct public download link
MODEL_URL = (
    "https://huggingface.co/fdtn-ai/Foundation-Sec-8B-Q4_K_M-GGUF/"
    "resolve/main/foundation-sec-8b-q4_k_m.gguf"
)
MODEL_PATH = "foundation-sec-8b-q4_k_m.gguf"


@asynccontextmanager
async def lifespan(app: FastAPI):
    logging.basicConfig(level=logging.INFO)

    # Download once; skip if already present
    if not os.path.exists(MODEL_PATH):
        logging.info("Downloading model … (~4.9 GB)")
        with requests.get(MODEL_URL, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(MODEL_PATH, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        logging.info("Download finished.")

    logging.info("Loading model …")
    app.state.llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=4096,
        n_threads=os.cpu_count(),
        verbose=False,
    )
    logging.info("Model ready.")

    yield

    logging.info("Shutting down.")


app = FastAPI(lifespan=lifespan)


class ChatRequest(BaseModel):
    messages: list[dict]
    max_tokens: int = 256
    temperature: float = 0.7


@app.get("/")
def root():
    return {"message": "Foundation-Sec-8B API running on HF Space"}


@app.post("/v1/chat/completions")
def chat(req: ChatRequest):
    try:
        return app.state.llm.create_chat_completion(
            messages=req.messages,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
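

# --- Example usage (sketch) ---------------------------------------------------
# A minimal client sketch for the /v1/chat/completions endpoint above. It
# assumes this file is saved as app.py and served with `uvicorn app:app`;
# the host/port and the sample prompt below are assumptions, adjust them to
# your deployment (e.g. your Space URL).
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:8000/v1/chat/completions",
#       json={
#           "messages": [{"role": "user", "content": "Summarize CVE-2021-44228."}],
#           "max_tokens": 128,
#       },
#       timeout=120,
#   )
#   resp.raise_for_status()
#   print(resp.json()["choices"][0]["message"]["content"])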