from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import psutil
import multiprocessing

app = FastAPI(title="LLM Agent API", version="1.0.0")


class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 100
    temperature: float = 0.7


class ChatResponse(BaseModel):
    response: str


class LocalLLMAgent:
    def __init__(self):
        # Download the model if it is not present locally
        model_path = "./llama-2-7b-chat.Q4_K_M.gguf"
        if not os.path.exists(model_path):
            print("📥 Downloading Llama-2-7B-Chat model (Q4_K_M)...", flush=True)
            print("   This may take a few minutes...", flush=True)
            model_path = hf_hub_download(
                repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
                filename="llama-2-7b-chat.Q4_K_M.gguf",
                local_dir="./"
            )
            print("✅ Model downloaded successfully!", flush=True)
        else:
            print("📁 Model already exists, loading...", flush=True)

        # Use every available CPU core for inference
        n_threads = multiprocessing.cpu_count()

        print("🔧 Configuring llama-cpp-python:", flush=True)
        print(f"   - Available CPUs: {n_threads}", flush=True)
        print(f"   - Threads: {n_threads}", flush=True)
        print("   - Context: 2048 tokens", flush=True)

        print("🚀 Initializing model...", flush=True)
        self.llm = Llama(
            model_path=model_path,
            chat_format="llama-2",
            n_ctx=2048,
            n_threads=n_threads,
            n_threads_batch=n_threads,
            verbose=False
        )
        print(f"✅ Model loaded! Using {n_threads} threads", flush=True)

        # System prompt: the agent always answers in Brazilian Portuguese
        self.messages = [
            {"role": "system", "content": "Responda sempre em português brasileiro de forma natural e conversacional."}
        ]

    def chat(self, message: str, max_tokens: int = 100, temperature: float = 0.7) -> str:
        print(f"💬 New message: {message[:50]}{'...' if len(message) > 50 else ''}")
        print(f"   Parameters: max_tokens={max_tokens}, temperature={temperature}")

        self.messages.append({"role": "user", "content": message})

        response = self.llm.create_chat_completion(
            messages=self.messages,
            max_tokens=max_tokens,
            temperature=temperature
        )

        assistant_message = response['choices'][0]['message']['content']
        self.messages.append({"role": "assistant", "content": assistant_message})

        print(f"✅ Response generated ({len(assistant_message)} chars)")
        return assistant_message


# The agent is created globally on startup
agent = None


@app.on_event("startup")
async def startup_event():
    print("=== STARTING LLM AGENT API ===", flush=True)
    print(f"Available CPUs: {multiprocessing.cpu_count()}", flush=True)
    print(f"Total memory: {round(psutil.virtual_memory().total / (1024**3), 2)} GB", flush=True)

    global agent
    agent = LocalLLMAgent()

    print("✅ API ready!", flush=True)
    print("Available endpoints:", flush=True)
    print("  - POST /chat", flush=True)
    print("  - GET /health", flush=True)
    print("  - GET /system", flush=True)


@app.post("/chat", response_model=ChatResponse)
async def chat_endpoint(request: ChatRequest):
    if agent is None:
        return ChatResponse(response="Model is still loading, please try again.")
    response = agent.chat(request.message, request.max_tokens, request.temperature)
    return ChatResponse(response=response)


@app.get("/health")
async def health_check():
    return {"status": "healthy"}


@app.get("/system")
async def system_info():
    cpu_count = multiprocessing.cpu_count()
    cpu_percent = psutil.cpu_percent(interval=1, percpu=True)
    memory = psutil.virtual_memory()

    return {
        "cpu_cores": cpu_count,
        "cpu_usage_per_core": cpu_percent,
        "cpu_usage_total": psutil.cpu_percent(interval=1),
        "memory_total_gb": round(memory.total / (1024**3), 2),
        "memory_used_gb": round(memory.used / (1024**3), 2),
        "memory_percent": memory.percent
    }


# Removed: uvicorn is started by the Dockerfile, not from this module
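# Example usage (a sketch only; the module name "main", port 8000, and the
# exact uvicorn invocation are assumptions, since the real command lives in
# the Dockerfile, which is not shown here):
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
#   curl -X POST http://localhost:8000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Olá, tudo bem?", "max_tokens": 100, "temperature": 0.7}'
#
#   curl http://localhost:8000/health
#   curl http://localhost:8000/system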