from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import psutil
import multiprocessing

app = FastAPI(title="LLM Agent API", version="1.0.0")


class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 100
    temperature: float = 0.7


class ChatResponse(BaseModel):
    response: str
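
# Illustrative payloads for POST /chat (example values only, not taken from the
# original code; the model's actual reply will vary):
#   request:  {"message": "Olá! Como você está?", "max_tokens": 100, "temperature": 0.7}
#   response: {"response": "Olá! Estou bem, obrigado. Como posso ajudar?"}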


class LocalLLMAgent:
    def __init__(self):
        # Download the model if it does not exist locally
        model_path = "./llama-2-7b-chat.Q4_K_M.gguf"
        if not os.path.exists(model_path):
            print("📥 Downloading the Llama-2-7B-Chat model (Q4_K_M)...", flush=True)
            print("   This may take a few minutes...", flush=True)
            model_path = hf_hub_download(
                repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
                filename="llama-2-7b-chat.Q4_K_M.gguf",
                local_dir="./"
            )
            print("✅ Model downloaded successfully!", flush=True)
        else:
            print("📁 Model already exists, loading...", flush=True)

        # Configure llama.cpp to use all available CPUs
        n_threads = multiprocessing.cpu_count()
        print("🔧 Configuring llama-cpp-python:", flush=True)
        print(f"   - Available CPUs: {n_threads}", flush=True)
        print(f"   - Threads: {n_threads}", flush=True)
        print("   - Context: 2048 tokens", flush=True)

        print("🚀 Initializing the model...", flush=True)
        self.llm = Llama(
            model_path=model_path,
            chat_format="llama-2",
            n_ctx=2048,
            n_threads=n_threads,
            n_threads_batch=n_threads,
            verbose=False
        )
        print(f"✅ Model loaded! Using {n_threads} threads", flush=True)

        # System prompt (kept in pt-BR on purpose): always answer in Brazilian
        # Portuguese in a natural, conversational way.
        self.messages = [
            {"role": "system", "content": "Responda sempre em português brasileiro de forma natural e conversacional."}
        ]
    def chat(self, message: str, max_tokens: int = 100, temperature: float = 0.75) -> str:
        print(f"💬 New message: {message[:50]}{'...' if len(message) > 50 else ''}")
        print(f"   Parameters: max_tokens={max_tokens}, temperature={temperature}")

        # Note: the conversation history grows without truncation, so very long
        # conversations can eventually exceed the 2048-token context window.
        self.messages.append({"role": "user", "content": message})
        response = self.llm.create_chat_completion(
            messages=self.messages,
            max_tokens=max_tokens,
            temperature=temperature
        )
        assistant_message = response['choices'][0]['message']['content']
        self.messages.append({"role": "assistant", "content": assistant_message})
        print(f"✅ Response generated ({len(assistant_message)} chars)")
        return assistant_message


# Global agent instance; created once in the startup event
agent = None


@app.on_event("startup")
async def startup_event():
    print("=== STARTING LLM AGENT API ===", flush=True)
    print(f"Available CPUs: {multiprocessing.cpu_count()}", flush=True)
    print(f"Total memory: {round(psutil.virtual_memory().total / (1024**3), 2)} GB", flush=True)
    global agent
    agent = LocalLLMAgent()
    print("✅ API ready to use!", flush=True)
    print("Available endpoints:", flush=True)
    print("  - POST /chat", flush=True)
    print("  - GET /health", flush=True)
    print("  - GET /system", flush=True)


@app.post("/chat", response_model=ChatResponse)
async def chat_endpoint(request: ChatRequest):
    if agent is None:
        # "Model still loading, try again." (kept in pt-BR, matching the app's response language)
        return ChatResponse(response="Modelo ainda carregando, tente novamente.")
    response = agent.chat(request.message, request.max_tokens, request.temperature)
    return ChatResponse(response=response)


@app.get("/health")
async def health_check():
    return {"status": "healthy"}


@app.get("/system")
async def system_info():
    # Note: the two interval=1 samples make this endpoint take roughly 2 seconds to respond.
    cpu_count = multiprocessing.cpu_count()
    cpu_percent = psutil.cpu_percent(interval=1, percpu=True)
    memory = psutil.virtual_memory()
    return {
        "cpu_cores": cpu_count,
        "cpu_usage_per_core": cpu_percent,
        "cpu_usage_total": psutil.cpu_percent(interval=1),
        "memory_total_gb": round(memory.total / (1024**3), 2),
        "memory_used_gb": round(memory.used / (1024**3), 2),
        "memory_percent": memory.percent
    }
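
# Illustrative /system response (example numbers only):
#   {"cpu_cores": 8, "cpu_usage_per_core": [12.5, ...], "cpu_usage_total": 10.3,
#    "memory_total_gb": 16.0, "memory_used_gb": 9.4, "memory_percent": 58.8}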


# No __main__ block here - uvicorn is started by the Dockerfile
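
# Usage sketch (assumptions: this file is served as app.py and the container
# exposes port 8000; adjust to match the actual Dockerfile):
#   uvicorn app:app --host 0.0.0.0 --port 8000
#   curl -X POST http://localhost:8000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Explique o que é llama.cpp em uma frase."}'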