from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import psutil
import multiprocessing

app = FastAPI(title="LLM Agent API", version="1.0.0")


class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 100
    temperature: float = 0.7


class ChatResponse(BaseModel):
    response: str


class LocalLLMAgent:
    def __init__(self):
        # Download the model if it is not present locally
        model_path = "./llama-2-7b-chat.Q4_K_M.gguf"
        if not os.path.exists(model_path):
            print("📥 Downloading Llama-2-7B-Chat model (Q4_K_M)...", flush=True)
            print("   This may take a few minutes...", flush=True)
            model_path = hf_hub_download(
                repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
                filename="llama-2-7b-chat.Q4_K_M.gguf",
                local_dir="./"
            )
            print("✅ Model downloaded successfully!", flush=True)
        else:
            print("📁 Model already exists, loading...", flush=True)

        # Use every available CPU core for inference
        n_threads = multiprocessing.cpu_count()

        print("🔧 Configuring llama-cpp-python:", flush=True)
        print(f"   - Available CPUs: {n_threads}", flush=True)
        print(f"   - Threads: {n_threads}", flush=True)
        print("   - Context: 2048 tokens", flush=True)

        print("🚀 Initializing model...", flush=True)
        self.llm = Llama(
            model_path=model_path,
            chat_format="llama-2",
            n_ctx=2048,
            n_threads=n_threads,
            n_threads_batch=n_threads,
            verbose=False
        )
        print(f"✅ Model loaded! Using {n_threads} threads", flush=True)

        # System prompt: the agent always answers in Brazilian Portuguese
        self.messages = [
            {"role": "system", "content": "Responda sempre em português brasileiro de forma natural e conversacional."}
        ]

    def chat(self, message: str, max_tokens: int = 100, temperature: float = 0.7) -> str:
        print(f"💬 New message: {message[:50]}{'...' if len(message) > 50 else ''}")
        print(f"   Parameters: max_tokens={max_tokens}, temperature={temperature}")

        self.messages.append({"role": "user", "content": message})

        response = self.llm.create_chat_completion(
            messages=self.messages,
            max_tokens=max_tokens,
            temperature=temperature
        )

        assistant_message = response['choices'][0]['message']['content']
        self.messages.append({"role": "assistant", "content": assistant_message})

        print(f"✅ Response generated ({len(assistant_message)} chars)")
        return assistant_message


# The agent is created globally on startup
agent = None


@app.on_event("startup")
async def startup_event():
    print("=== STARTING LLM AGENT API ===", flush=True)
    print(f"Available CPUs: {multiprocessing.cpu_count()}", flush=True)
    print(f"Total memory: {round(psutil.virtual_memory().total / (1024**3), 2)} GB", flush=True)

    global agent
    agent = LocalLLMAgent()

    print("✅ API ready!", flush=True)
    print("Available endpoints:", flush=True)
    print("  - POST /chat", flush=True)
    print("  - GET /health", flush=True)
    print("  - GET /system", flush=True)


@app.post("/chat", response_model=ChatResponse)
async def chat_endpoint(request: ChatRequest):
    if agent is None:
        return ChatResponse(response="Model is still loading, please try again.")
    response = agent.chat(request.message, request.max_tokens, request.temperature)
    return ChatResponse(response=response)


@app.get("/health")
async def health_check():
    return {"status": "healthy"}


@app.get("/system")
async def system_info():
    cpu_count = multiprocessing.cpu_count()
    cpu_percent = psutil.cpu_percent(interval=1, percpu=True)
    memory = psutil.virtual_memory()

    return {
        "cpu_cores": cpu_count,
        "cpu_usage_per_core": cpu_percent,
        "cpu_usage_total": psutil.cpu_percent(interval=1),
        "memory_total_gb": round(memory.total / (1024**3), 2),
        "memory_used_gb": round(memory.used / (1024**3), 2),
        "memory_percent": memory.percent
    }


# Removed: uvicorn is started by the Dockerfile, not from this module
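# Example usage (a sketch only; the module name "main", port 8000, and the
# exact uvicorn invocation are assumptions, since the real command lives in
# the Dockerfile, which is not shown here):
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
#   curl -X POST http://localhost:8000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Olá, tudo bem?", "max_tokens": 100, "temperature": 0.7}'
#
#   curl http://localhost:8000/health
#   curl http://localhost:8000/system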