# app.py
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import uvicorn
import os

MODEL_REPO = "QuantFactory/Qwen2.5-7B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-7B-Instruct.Q4_K_M.gguf"
MODEL_PATH = f"/home/user/app/data/cache/{MODEL_FILE}"

# Make sure the model file exists; download it from the Hub if it does not
if not os.path.exists(MODEL_PATH):
    from huggingface_hub import hf_hub_download
    os.makedirs("/home/user/app/data/cache", exist_ok=True)
    hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE,
        local_dir="/home/user/app/data/cache",
    )

if os.path.exists(MODEL_PATH):
    print(f"Model found at {MODEL_PATH}")
else:
    print(f"Model not found at {MODEL_PATH}")

# Load the model (CPU-only inference: n_gpu_layers=0)
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=16000,
    n_threads=6,
    n_gpu_layers=0,
    verbose=False,
)

# Plain system prompt text only; the ChatML tags (<|im_start|>/<|im_end|>)
# are added by format_prompt, so they must not be duplicated here.
SYSTEM_PROMPT = """You are Qwen, created by Alibaba Cloud. You are an AI development assistant. Follow these rules:
1. If the request is simple (single file, <50 lines), handle it directly
2. For complex requests (multiple files, >50 lines), just respond with "CODER"
3. Always check code for errors before sending
4. Never execute unsafe code"""

# API setup
app = FastAPI()


class ChatRequest(BaseModel):
    message: str
    history: list[list[str]] = []  # [[user, assistant], ...]


class ChatResponse(BaseModel):
    response: str
    updated_history: list[list[str]]


def format_prompt(messages):
    """Render (role, content) pairs into a ChatML prompt for Qwen."""
    chat = []
    for role, content in messages:
        if role == "system":
            chat.append(f"<|im_start|>system\n{content}<|im_end|>")
        elif role == "user":
            chat.append(f"<|im_start|>user\n{content}<|im_end|>")
        else:
            chat.append(f"<|im_start|>assistant\n{content}<|im_end|>")
    # Leave the final assistant turn open so the model continues from here
    chat.append("<|im_start|>assistant\n")
    return "\n".join(chat)


@app.post("/chat", response_model=ChatResponse)
def chat(req: ChatRequest):
    # Rebuild the full conversation: system prompt, prior turns, new message
    messages = [("system", SYSTEM_PROMPT.strip())]
    for user_msg, bot_msg in req.history:
        messages.append(("user", user_msg))
        messages.append(("assistant", bot_msg))
    messages.append(("user", req.message))

    prompt = format_prompt(messages)
    output = llm(
        prompt,
        max_tokens=1024,
        temperature=0.7,
        top_p=0.9,
        repeat_penalty=1.05,
        stop=["<|im_end|>"],
    )
    reply = output["choices"][0]["text"].split("<|im_end|>")[0].strip()

    req.history.append([req.message, reply])
    return ChatResponse(response=reply, updated_history=req.history)
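

# Minimal entrypoint sketch so the `import uvicorn` above is actually used
# when the script is run directly. The host/port values are assumptions
# (7860 is the conventional port for a Hugging Face Space); adjust them to
# match the real deployment.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)

# Example call against the /chat endpoint (assuming the server runs locally
# on the port above):
#
#   curl -X POST http://localhost:7860/chat \
#     -H "Content-Type: application/json" \
#     -d '{"message": "Write a hello world script", "history": []}'
#
# The JSON response contains "response" (the assistant reply) and
# "updated_history", which can be passed back as "history" on the next call.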