|
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from ctransformers import AutoModelForCausalLM
import os

# Quantized GGUF build of Mistral-7B-Instruct-v0.1 from the Hugging Face Hub.
# The Q4_K_M file is a 4-bit quantization small enough to run on CPU-only hosts.
MODEL_REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
MODEL_FILE = "mistral-7b-instruct-v0.1.Q4_K_M.gguf"

# Make sure the huggingface_hub cache directory exists before the first
# download (the /code default suits containerized deployments).
os.makedirs(os.environ.get("HUGGINGFACE_HUB_CACHE", "/code/.cache/huggingface"), exist_ok=True)

# Download the GGUF file on first run and load it with ctransformers'
# llama.cpp-based backend.
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO_ID,
    model_file=MODEL_FILE,
    model_type="mistral",
    gpu_layers=0,  # 0 = run entirely on CPU; raise to offload layers to a GPU
    context_length=2048,  # max tokens for prompt + generated output combined
)
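
# ctransformers also accepts per-call sampling options such as temperature,
# top_p, and top_k. The values below are illustrative, not tuned defaults:
#   llm(prompt, max_new_tokens=256, temperature=0.7, top_p=0.95)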

app = FastAPI(title="Mistral GGUF LLM API (ctransformers)", version="1.0.0")


class InferenceRequest(BaseModel):
    prompt: str
    max_tokens: int = 256  # cap on newly generated tokens per request


class InferenceResponse(BaseModel):
    output: str


@app.post("/infer", response_model=InferenceResponse)
def infer(req: InferenceRequest):
    try:
        # ctransformers model objects are callable: prompt text in, completion out.
        generated_text = llm(req.prompt, max_new_tokens=req.max_tokens)
        return InferenceResponse(output=str(generated_text).strip())
    except Exception as e:
        # Surface generation failures as a proper 500 instead of a 200
        # response whose body merely looks like model output.
        raise HTTPException(status_code=500, detail=f"Error generating response: {e}") from e
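
# Example request once the server is up (host and port assume a default
# local uvicorn run). Mistral-Instruct v0.1 responds best when the prompt
# is wrapped in its [INST] ... [/INST] instruction tags:
#   curl -X POST http://localhost:8000/infer \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "[INST] Explain GGUF in one sentence. [/INST]", "max_tokens": 128}'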


@app.get("/")
def health():
    # Lightweight liveness check; interactive Swagger docs are served at /docs.
    return {"status": "LLM is running. Visit /docs for Swagger UI"}
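

# Local run helper. This is a convenience sketch: it assumes uvicorn is
# installed and that this file is saved as app.py; in production you would
# typically launch via `uvicorn app:app` instead.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)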