# Docker-JD / app.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from ctransformers import AutoModelForCausalLM
import os
# Model configuration for ctransformers (CPU-friendly)
MODEL_REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
MODEL_FILE = "mistral-7b-instruct-v0.1.Q4_K_M.gguf"
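# Other quantizations from the same repo (e.g. Q5_K_M) can be swapped in via
# MODEL_FILE; smaller quants trade accuracy for a smaller memory footprint.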

# Point the Hugging Face cache at a writable path inside the container and
# create it; setting the env var is what makes huggingface_hub actually use it.
CACHE_DIR = os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/code/.cache/huggingface")
os.makedirs(CACHE_DIR, exist_ok=True)

# Load the GGUF model once at startup so requests don't pay the load cost
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO_ID,
    model_file=MODEL_FILE,
    model_type="mistral",
    gpu_layers=0,         # CPU-only inference
    context_length=2048,  # window shared by the prompt and generated tokens
)
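# Note: the first start downloads the ~4 GB Q4_K_M weights into the cache
# dir above; later starts reuse the cached copy.
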
app = FastAPI(title="Mistral GGUF LLM API (ctransformers)", version="1.0.0")

class InferenceRequest(BaseModel):
    prompt: str
    max_tokens: int = 256


class InferenceResponse(BaseModel):
    output: str

@app.post("/infer", response_model=InferenceResponse)
def infer(req: InferenceRequest):
    try:
        generated_text = llm(req.prompt, max_new_tokens=req.max_tokens)
    except Exception as e:  # surface failures as an HTTP error, not a silent 200
        raise HTTPException(status_code=500, detail=f"Error generating response: {e}")
    return InferenceResponse(output=str(generated_text).strip())
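
# Example request (a sketch; assumes the usual Docker Space port 7860, so
# adjust host/port to your deployment):
#   curl -X POST http://localhost:7860/infer \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "[INST] What is FastAPI? [/INST]", "max_tokens": 128}'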

@app.get("/")
def health():
    return {"status": "LLM is running. Visit /docs for Swagger UI"}