from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from ctransformers import AutoModelForCausalLM
import os

# Model configuration for ctransformers (CPU-friendly GGUF quantization)
MODEL_REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
MODEL_FILE = "mistral-7b-instruct-v0.1.Q4_K_M.gguf"

# Ensure the Hugging Face cache dir is set and writable in the container.
# setdefault makes huggingface_hub actually use the directory we create.
cache_dir = os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/code/.cache/huggingface")
os.makedirs(cache_dir, exist_ok=True)

# Load the model once at startup; gpu_layers=0 keeps inference on the CPU
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO_ID,
    model_file=MODEL_FILE,
    model_type="mistral",
    gpu_layers=0,
    context_length=2048,
)

app = FastAPI(title="Mistral GGUF LLM API (ctransformers)", version="1.0.0")


class InferenceRequest(BaseModel):
    prompt: str
    max_tokens: int = 256


class InferenceResponse(BaseModel):
    output: str


@app.post("/infer", response_model=InferenceResponse)
def infer(req: InferenceRequest):
    try:
        generated_text = llm(req.prompt, max_new_tokens=req.max_tokens)
        return InferenceResponse(output=str(generated_text).strip())
    except Exception as e:
        # Surface generation failures as an HTTP 500 rather than a 200
        # whose body happens to contain an error string
        raise HTTPException(status_code=500, detail=f"Error generating response: {e}")


@app.get("/")
def health():
    return {"status": "LLM is running. Visit /docs for Swagger UI"}
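
# A minimal usage sketch. It assumes this file is saved as main.py and served
# with `uvicorn main:app --host 0.0.0.0 --port 8000`; the module name, host,
# and port are assumptions, not fixed by the code above. The /infer endpoint
# and the prompt/max_tokens fields come from the models defined above:
#
#   curl -X POST http://localhost:8000/infer \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain GGUF in one sentence.", "max_tokens": 64}'
#
# The same endpoint can be exercised interactively from the Swagger UI that
# FastAPI serves at http://localhost:8000/docs.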