| from fastapi import FastAPI |
| from pydantic import BaseModel |
| from llama_cpp import Llama |
| from huggingface_hub import hf_hub_download |
| import multiprocessing |
|
|
# FastAPI application exposing the local LLM over HTTP.
app = FastAPI()

# GGUF quantization (Q4_K_M) of Qwen2.5-3B-Instruct published by bartowski.
MODEL_REPO = "bartowski/Qwen2.5-3B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-3B-Instruct-Q4_K_M.gguf"

# Fetch the weights at startup; huggingface_hub caches the file locally,
# so only the first run actually downloads. Returns the on-disk .gguf path.
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
)
|
|
# Load the model once at process startup; the single Llama instance is
# shared by every request handler below.
llm = Llama(
    model_path=model_path,
    n_ctx=8192,                             # context window size in tokens
    n_threads=multiprocessing.cpu_count(),  # use every available CPU core
    n_gpu_layers=0,                         # CPU-only inference
    n_batch=512,                            # prompt-evaluation batch size
    use_mmap=True,                          # memory-map weights from disk
    use_mlock=True,                         # pin weights in RAM (avoid swapping)
)
|
|
class ChatRequest(BaseModel):
    """Request body for POST /chat."""

    # The user's message to forward to the model.
    message: str
|
|
| @app.get("/") |
| def root(): |
| return {"status": "Strategy AI engine running"} |
|
|
| |
| |
| |
|
|
| @app.post("/chat") |
| def chat(req: ChatRequest): |
|
|
| |
| system_prompt = ( |
| "<|system|>" |
| "You are an elite strategic intelligence AI. " |
| "Think step-by-step before answering. " |
| "Provide deep analysis, structured reasoning, and clear actionable insights. " |
| "Use bullet points, numbered steps, and markdown formatting." |
| "<|end|>" |
| ) |
|
|
| prompt = system_prompt + f"<|user|>{req.message}<|assistant|>" |
|
|
| output = llm( |
| prompt, |
|
|
| |
| max_tokens=900, |
|
|
| |
| temperature=0.35, |
|
|
| |
| top_p=0.9, |
|
|
| |
| repeat_penalty=1.2, |
|
|
| stop=["<|end|>"] |
| ) |
|
|
| response_text = output["choices"][0]["text"].strip() |
|
|
| return {"reply": response_text} |
|
|
| |
| |
| |
|
|
| if __name__ == "__main__": |
| import uvicorn |
| uvicorn.run(app, host="0.0.0.0", port=7860) |
|
|