import os
import logging
from contextlib import asynccontextmanager

import requests
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama

# Quantized GGUF build of Foundation-Sec-8B, fetched once at startup.
MODEL_URL = (
    "https://huggingface.co/fdtn-ai/Foundation-Sec-8B-Q4_K_M-GGUF/"
    "resolve/main/foundation-sec-8b-q4_k_m.gguf"
)
MODEL_PATH = "foundation-sec-8b-q4_k_m.gguf"


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Download the model on first boot, load it, and keep it on app.state."""
    logging.basicConfig(level=logging.INFO)

    if not os.path.exists(MODEL_PATH):
        logging.info("Downloading model … (~4.9 GB)")
        # Stream to disk in 8 KiB chunks so the file is never held in
        # memory; the timeout applies per read, not to the whole transfer.
        with requests.get(MODEL_URL, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(MODEL_PATH, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        logging.info("Download finished.")

    logging.info("Loading model …")
    app.state.llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=4096,                # context window in tokens
        n_threads=os.cpu_count(),  # use every available CPU core
        verbose=False,
    )
    logging.info("Model ready.")

    yield  # the app serves requests while suspended here

    logging.info("Shutting down.")


app = FastAPI(lifespan=lifespan)


class ChatRequest(BaseModel):
    messages: list[dict]  # OpenAI-style [{"role": ..., "content": ...}]
    max_tokens: int = 256
    temperature: float = 0.7
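
# Example request body — the role/content message dicts are the format
# llama-cpp-python's create_chat_completion accepts:
#   {"messages": [{"role": "user", "content": "Hi"}], "max_tokens": 64}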


@app.get("/")
def root():
    return {"message": "Foundation-Sec-8B API running on HF Space"}


@app.post("/v1/chat/completions")
def chat(req: ChatRequest):
    # Plain `def` (not `async def`): FastAPI runs it in a worker thread,
    # so the blocking llama.cpp call never stalls the event loop.
    try:
        return app.state.llm.create_chat_completion(
            messages=req.messages,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
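
# Quick smoke test once the server is up (a sketch — it assumes uvicorn is
# serving on port 7860, the conventional port for a Hugging Face Space;
# adjust host/port to match your launch command):
#
#   curl -s http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "What is CVE-2021-44228?"}]}'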