# paraAI_rag / setup.py — uploaded by caarleexx ("Upload 9 files", commit cb3b8cb, verified)
#!/usr/bin/env python3
import os
import sys
import yaml
import json
import subprocess
import logging
from pathlib import Path
from datetime import datetime
# Timestamped INFO logging so the background setup can be followed from stdout.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# JSON progress snapshot rewritten by update_status() on every stage change.
STATUS_FILE = Path('/tmp/setup_status.json')
# Touched by main() on success; its existence short-circuits subsequent runs.
READY_FLAG = Path('/tmp/faiss_ready')
def update_status(status, message, progress=0):
    """Persist the current setup stage to STATUS_FILE and log it.

    Args:
        status: Short stage name (e.g. 'cloning', 'ready', 'error').
        message: Human-readable description of the stage.
        progress: Completion percentage, 0-100.
    """
    snapshot = {
        'status': status,
        'message': message,
        'progress': progress,
        'timestamp': datetime.now().isoformat(),
    }
    with STATUS_FILE.open('w') as handle:
        json.dump(snapshot, handle)
    logger.info(f"[{progress}%] {status}: {message}")
    # Force the line out immediately so external monitors see it right away.
    sys.stdout.flush()
def run_cmd(cmd, desc):
    """Run a shell command, logging its outcome.

    Args:
        cmd: Shell command line. Executed with shell=True because callers
            rely on pipes, globbing, redirection and `find -exec`.
        desc: Human-readable description used in log messages.

    Returns:
        The command's captured stdout as text.

    Raises:
        RuntimeError: if the command exits non-zero (stderr is logged first).
    """
    logger.info(f"Executando: {desc}")
    # NOTE(review): shell=True is only safe because cmd is assembled from
    # trusted config.yaml values — never pass untrusted input here.
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        logger.error(f"ERRO: {result.stderr}")
        # RuntimeError instead of bare Exception: more specific, still caught
        # by the `except Exception` handler in main().
        raise RuntimeError(f"{desc} falhou")
    logger.info(f"✅ {desc}")
    return result.stdout
def main():
    """Orchestrate the RAG setup: clone chunks, extract, filter, build FAISS.

    Reads config.yaml, sparse-clones the chunk archives from GitHub,
    extracts and concatenates the JSONL records, filters fields and builds
    the FAISS index, reporting progress through update_status() along the
    way. On any failure the status is set to 'error' and the process exits
    with code 1.
    """
    try:
        logger.info("=" * 80)
        logger.info("🚀 PARA.AI RAG (LangChain) - SETUP EM BACKGROUND")
        logger.info("=" * 80)
        update_status('loading', 'Carregando configuração', 0)
        with open('config.yaml') as f:
            config = yaml.safe_load(f)
        # cluster_id is read to validate the config schema; not used below.
        cluster_id = config['cluster_id']
        chunk_start = config['chunk_start']
        chunk_end = config['chunk_end']
        github_repo = config['github_repo']
        # Idempotency: a previous run already built the index — nothing to do.
        if READY_FLAG.exists():
            logger.info("✅ FAISS já pronto!")
            update_status('ready', 'FAISS já existe', 100)
            return
        # CLONE — blobless sparse checkout so only the needed archives download.
        update_status('cloning', 'Clonando chunks (sparse checkout)', 10)
        os.makedirs('/tmp/repo', exist_ok=True)
        os.chdir('/tmp/repo')
        run_cmd(f"git clone --filter=blob:none --sparse {github_repo} .", "Git clone")
        run_cmd("git sparse-checkout init --cone", "Sparse checkout init")
        patterns = [f"chunks_dados/chunk_dados_{i:04d}.tar.gz" for i in range(chunk_start, chunk_end + 1)]
        # Add patterns in batches of 50 to keep each command line short.
        for i in range(0, len(patterns), 50):
            batch = ' '.join(patterns[i:i+50])
            run_cmd(f"git sparse-checkout add {batch}", f"Batch {i//50 + 1}")
        chunks_count = int(run_cmd("find chunks_dados -name '*.tar.gz' 2>/dev/null | wc -l", "Contar chunks").strip())
        logger.info(f"✅ {chunks_count} chunks clonados")
        # EXTRACT — raw strings below: '\;' is a find(1) token, not a Python
        # escape (a plain "\;" is an invalid escape, SyntaxWarning on 3.12+).
        update_status('extracting', f'Descompactando {chunks_count} chunks', 30)
        os.makedirs('/tmp/extracted', exist_ok=True)
        run_cmd(r"find chunks_dados -name '*.tar.gz' -exec tar -xzf {} -C /tmp/extracted \; 2>/dev/null || true", "Descompactar")
        # CONCAT — merge every per-chunk JSONL into a single file.
        update_status('concatenating', 'Concatenando JSONL', 50)
        run_cmd(r"find /tmp/extracted -name 'jurisprudencias.jsonl' -exec cat {} \; > /tmp/all_records.jsonl 2>/dev/null || true", "Concatenar")
        total_records = int(run_cmd("wc -l < /tmp/all_records.jsonl 2>/dev/null || echo '0'", "Contar registros").strip())
        logger.info(f"✅ {total_records:,} registros")
        # FILTER — keep only the fields the index needs (id + ementa).
        update_status('filtering', 'Filtrando campos (id + ementa)', 60)
        os.chdir('/home/user/app')
        run_cmd("python3 filter_fields.py --input /tmp/all_records.jsonl --output /tmp/filtered.jsonl", "Filtrar")
        # BUILD FAISS — the long step; status stays at 70% while it runs.
        update_status('building', 'Construindo FAISS index (pode demorar)', 70)
        run_cmd("python3 rag_builder.py --input /tmp/filtered.jsonl", "Build FAISS")
        # CLEANUP — drop all intermediate artifacts.
        update_status('cleaning', 'Limpando temporários', 95)
        run_cmd("rm -rf /tmp/repo /tmp/extracted /tmp/all_records.jsonl /tmp/filtered.jsonl", "Limpar")
        # DONE — mark success so reruns short-circuit at the READY_FLAG check.
        update_status('ready', f'FAISS pronto com {total_records:,} registros!', 100)
        READY_FLAG.touch()
        logger.info("=" * 80)
        logger.info("✅ SETUP COMPLETO!")
        logger.info("=" * 80)
    except Exception as e:
        logger.error(f"❌ ERRO: {e}")
        update_status('error', str(e), 0)
        sys.exit(1)
# Run the setup when executed as a script (not on import).
if __name__ == "__main__":
    main()