from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import os
import uvicorn

app = FastAPI()
# --- Model configuration ---
# Make sure you have downloaded a GGUF model from Hugging Face first, e.g.:
# https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF
# (a download sketch follows the server code below)
MODEL_PATH = "./Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf"  # change to match your local file
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,    # context window size in tokens
    n_threads=4,   # adjust to the number of CPU cores
    n_batch=512    # prompt-processing batch size
)
# --- Request schema ---
class ChatRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 256
# --- Chat endpoint ---
@app.post("/chat")
def chat(req: ChatRequest):
    output = llm(
        req.prompt,
        max_tokens=req.max_new_tokens,
        stop=["</s>", "User:", "Assistant:"],
        echo=False
    )
    response = output["choices"][0]["text"].strip()
    return {"response": response}
# --- Root endpoint ---
@app.get("/")
def root():
    return {"message": "Qwen GGUF FastAPI running"}
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run("app:app", host="0.0.0.0", port=port)
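
The configuration above expects the GGUF file to already be on disk. As a minimal sketch, one way to fetch it is with the huggingface_hub client; the filename below is a placeholder and must match a file that actually exists in the Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF repository.

from huggingface_hub import hf_hub_download

# Placeholder filename -- check the repository for the exact GGUF file name.
model_path = hf_hub_download(
    repo_id="Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF",
    filename="qwen2.5-coder-0.5b-instruct-q4_k_m.gguf",
    local_dir=".",
)
print(model_path)  # use this path as MODEL_PATH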
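
Once the server is running (e.g. python app.py), a quick way to exercise the /chat endpoint is a small requests client; the port assumes the default 7860 set above.

import requests

# Send a prompt to the /chat endpoint and print the generated text.
resp = requests.post(
    "http://localhost:7860/chat",
    json={"prompt": "Write a Python function that reverses a string.", "max_new_tokens": 128},
)
resp.raise_for_status()
print(resp.json()["response"])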