# qwen_api/app_quantized.py
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import os
import uvicorn

app = FastAPI()

# --- Model Configuration ---
# Make sure you have downloaded a GGUF model from Hugging Face first, e.g.:
# https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF
MODEL_PATH = "./Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf"  # change to match your local file
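
# Optional: a minimal sketch for fetching the model with huggingface_hub
# (an assumption, not part of the original app; the exact GGUF filename in the
# repo may differ, so check the repo's file listing first):
#
# from huggingface_hub import hf_hub_download
# MODEL_PATH = hf_hub_download(
#     repo_id="Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF",
#     filename="qwen2.5-coder-0.5b-instruct-q4_k_m.gguf",  # hypothetical filename
# )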

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,    # context window size in tokens
    n_threads=4,   # adjust to the number of available CPU cores
    n_batch=512,   # batch size
)
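
# Note: the /chat endpoint below calls llm(...) as a raw text completion with
# manual stop strings. A sketch of the chat-style alternative, which applies
# the model's built-in chat template (create_chat_completion is provided by
# llama-cpp-python):
#
# result = llm.create_chat_completion(
#     messages=[{"role": "user", "content": "Hello"}],
#     max_tokens=256,
# )
# text = result["choices"][0]["message"]["content"]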

# --- Request Schema ---
class ChatRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 256

# --- Chat Endpoint ---
@app.post("/chat")
def chat(req: ChatRequest):
    output = llm(
        req.prompt,
        max_tokens=req.max_new_tokens,
        stop=["</s>", "User:", "Assistant:"],
        echo=False,
    )
    response = output["choices"][0]["text"].strip()
    return {"response": response}

# --- Root Endpoint ---
@app.get("/")
def root():
    return {"message": "Qwen GGUF FastAPI running 🚀"}

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    # The import string must match this module's filename (app_quantized.py);
    # "app:app" would fail to import.
    uvicorn.run("app_quantized:app", host="0.0.0.0", port=port)
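
# To start the server, either run this file directly or invoke uvicorn with
# the module name matching this file (app_quantized.py):
#   python app_quantized.py
#   uvicorn app_quantized:app --host 0.0.0.0 --port 7860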