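"""FastAPI service that serves the Gema 4B GGUF model via ctransformers."""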
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from ctransformers import AutoModelForCausalLM
import os
import uvicorn
from typing import Optional, List
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI(title="Gema 4B Model API", version="1.0.0")
# Request model - flexible, accepts all generation parameters
class TextRequest(BaseModel):
inputs: str
system_prompt: Optional[str] = None
max_tokens: Optional[int] = 512
temperature: Optional[float] = 0.7
top_k: Optional[int] = 50
top_p: Optional[float] = 0.9
repeat_penalty: Optional[float] = 1.1
stop: Optional[List[str]] = None
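# Example request body for /generate (illustrative values only):
# {
#   "inputs": "Jelaskan apa itu machine learning.",
#   "system_prompt": "You are a helpful assistant.",
#   "max_tokens": 256,
#   "stop": ["\nUser:"]
# }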
# Response model
class TextResponse(BaseModel):
generated_text: str
# Global model variable
model = None
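# The model is loaded once at application startup and shared by all requests.
# Note: recent FastAPI releases deprecate @app.on_event in favor of lifespan
# handlers; on_event still works for this simple case.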
@app.on_event("startup")
async def load_model():
global model
try:
logger.info("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
"Dnfs/gema-4b-indra10k-model1-Q4_K_M-GGUF",
model_file="gema-4b-indra10k-model1-q4_k_m.gguf",
model_type="llama",
gpu_layers=0, # Set to appropriate number if using GPU
context_length=2048,
            threads=os.cpu_count() or 1  # os.cpu_count() may return None on some platforms
)
logger.info("Model loaded successfully!")
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        raise
@app.post("/generate", response_model=TextResponse)
async def generate_text(request: TextRequest):
    if model is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
try:
        # Build the prompt - prepend the system prompt if given, otherwise use the raw user input
if request.system_prompt:
full_prompt = f"{request.system_prompt}\n\nUser: {request.inputs}\nAssistant:"
else:
full_prompt = request.inputs
        # Generate text with the parameters from the request
generated_text = model(
full_prompt,
max_new_tokens=request.max_tokens,
temperature=request.temperature,
top_p=request.top_p,
top_k=request.top_k,
repetition_penalty=request.repeat_penalty,
stop=request.stop or []
)
        # Strip the prompt scaffolding from the response if the model echoed it
if "Assistant:" in generated_text:
generated_text = generated_text.split("Assistant:")[-1].strip()
return TextResponse(generated_text=generated_text)
except Exception as e:
logger.error(f"Generation error: {e}")
raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
@app.get("/health")
async def health_check():
return {"status": "healthy", "model_loaded": model is not None}
@app.get("/")
async def root():
return {"message": "Gema 4B Model API", "docs": "/docs"}
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")
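# Example call once the server is running (hypothetical prompt text):
#   curl -X POST http://localhost:8000/generate \
#     -H "Content-Type: application/json" \
#     -d '{"inputs": "Halo, apa kabar?", "max_tokens": 64}'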