| | """ |
| | AetherMap Client |
| | Client para integração com AetherMap API - busca semântica, NER e análise de grafos. |
| | """ |
| | import httpx |
| | import json |
| | import io |
| | from typing import List, Dict, Any, Optional |
| | from dataclasses import dataclass, field |
| | from datetime import datetime |
| | import logging |
| |
|
| | from app.config import settings |
| |
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Base URL of the AetherMap service. Taken from `settings.aethermap_url`
# when that attribute exists; otherwise the default below is used.
AETHERMAP_URL = getattr(
    settings,
    'aethermap_url',
    'https://madras1-aethermap.hf.space',
)
| |
|
| |
|
@dataclass
class ProcessResult:
    """Outcome of a document-processing request."""

    # Identifier of the processing job.
    job_id: str
    # Number of documents processed.
    num_documents: int
    # Number of clusters found.
    num_clusters: int
    # Number of points classified as noise.
    num_noise: int
    # Free-form metrics mapping; empty when none are provided.
    metrics: Dict[str, Any] = field(default_factory=dict)
    # Free-form per-cluster analysis mapping; empty when none is provided.
    cluster_analysis: Dict[str, Any] = field(default_factory=dict)
| |
|
| |
|
@dataclass
class SearchResult:
    """Outcome of a semantic search."""

    # Textual summary of the search.
    summary: str
    # Individual hits as free-form mappings; empty list by default.
    results: List[Dict[str, Any]] = field(default_factory=list)
| |
|
| |
|
@dataclass
class EntityNode:
    """A single entity (node) of the entity graph."""

    # Entity text.
    entity: str
    # Entity category label.
    entity_type: str
    # Presumably the number of documents mentioning the entity - TODO confirm.
    docs: int
    # Node degree in the graph; defaults to 0.
    degree: int = 0
    # Centrality score; defaults to 0.0.
    centrality: float = 0.0
    # Role label of the node; defaults to "peripheral".
    role: str = "peripheral"
| |
|
| |
|
@dataclass
class EntityEdge:
    """A connection (edge) between two entities of the entity graph."""

    # Entity at one end of the edge.
    source_entity: str
    # Entity at the other end of the edge.
    target_entity: str
    # Integer weight of the connection.
    weight: int
    # Textual explanation attached to the connection.
    reason: str
| |
|
| |
|
@dataclass
class EntityGraphResult:
    """Outcome of entity extraction: the graph plus derived information."""

    # Graph nodes; empty list by default.
    nodes: List[EntityNode] = field(default_factory=list)
    # Graph edges; empty list by default.
    edges: List[EntityEdge] = field(default_factory=list)
    # Hub descriptions as free-form mappings; empty list by default.
    hubs: List[Dict[str, Any]] = field(default_factory=list)
    # Free-form insights mapping; empty dict by default.
    insights: Dict[str, Any] = field(default_factory=dict)
| |
|
| |
|
@dataclass
class GraphAnalysis:
    """LLM-produced analysis of the graph."""

    # Analysis text.
    analysis: str
    # Names of the key entities; empty list by default.
    key_entities: List[str] = field(default_factory=list)
    # Relationship descriptions; empty list by default.
    relationships: List[str] = field(default_factory=list)
| |
|
| |
|
class AetherMapClient:
    """
    Async client for the AetherMap HTTP API.

    Features:
    - Document processing (embeddings + clusters)
    - Hybrid RAG semantic search (FAISS + BM25 + reranking + LLM)
    - Named-entity (NER) extraction
    - LLM-based graph analysis

    The client remembers the job id returned by the most recent
    ``process_documents`` call; every other method falls back to it when no
    ``job_id`` argument is given.
    """

    def __init__(self, base_url: Optional[str] = None, timeout: float = 1800.0):
        """
        Args:
            base_url: Service root URL; falls back to ``AETHERMAP_URL`` when
                omitted or empty. A trailing slash is stripped.
            timeout: Timeout handed to every ``httpx.AsyncClient`` created
                by this client.
        """
        self.base_url = (base_url or AETHERMAP_URL).rstrip('/')
        self.timeout = timeout
        self._current_job_id: Optional[str] = None

    @property
    def current_job_id(self) -> Optional[str]:
        """Job id stored by the last ``process_documents`` call, if any."""
        return self._current_job_id

    def _resolve_job_id(self, job_id: Optional[str]) -> str:
        """Return ``job_id`` if given, else the stored one; raise ValueError if neither exists."""
        resolved = job_id or self._current_job_id
        if not resolved:
            raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
        return resolved

    async def _post_form(self, endpoint: str, data: Dict[str, Any]) -> Any:
        """POST form ``data`` to ``/<endpoint>/`` and return the decoded JSON body.

        Raises:
            Exception: when the response status is not 200; the message
                carries the endpoint name, the status code and the body.
        """
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            response = await client.post(f"{self.base_url}/{endpoint}/", data=data)

            if response.status_code != 200:
                raise Exception(
                    f"AetherMap {endpoint} error: {response.status_code} - {response.text}"
                )

            return response.json()

    async def process_documents(
        self,
        texts: List[str],
        fast_mode: bool = True,
        min_cluster_size: int = 0,
        min_samples: int = 0
    ) -> ProcessResult:
        """
        Upload a list of texts to generate embeddings and clusters.

        On success the returned job id is stored on the client and becomes
        the default for the other methods.

        Args:
            texts: Texts/documents to process.
            fast_mode: If True, uses PCA (fast). If False, uses UMAP (accurate).
            min_cluster_size: Minimum cluster size (0 = auto).
            min_samples: Minimum number of samples (0 = auto).

        Returns:
            ProcessResult with the job id and metrics.

        Raises:
            Exception: on a non-200 response, on timeout, or on connection
                failure. Any other error is logged and re-raised unchanged.
        """
        # The documents are uploaded as one UTF-8 text file, joined by "\n".
        # NOTE(review): a text that itself contains "\n" spans several lines
        # of the upload while n_samples still counts it once - confirm how
        # the server splits the file.
        file_bytes = "\n".join(texts).encode('utf-8')
        files = {
            'file': ('documents.txt', io.BytesIO(file_bytes), 'text/plain')
        }
        data = {
            'n_samples': str(len(texts)),
            'fast_mode': 'true' if fast_mode else 'false',
            'min_cluster_size': str(min_cluster_size),
            'min_samples': str(min_samples)
        }
        url = f"{self.base_url}/process/"

        try:
            logger.info("AetherMap: Processando %d documentos para %s", len(texts), url)

            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.post(url, files=files, data=data)

            logger.info("AetherMap: Response status %s", response.status_code)

            if response.status_code == 200:
                result = response.json()

                self._current_job_id = result.get('job_id')
                metadata = result.get('metadata', {})

                logger.info("AetherMap: Job criado %s", self._current_job_id)

                return ProcessResult(
                    job_id=self._current_job_id or "unknown",
                    num_documents=metadata.get('num_documents_processed', len(texts)),
                    num_clusters=metadata.get('num_clusters_found', 0),
                    num_noise=metadata.get('num_noise_points', 0),
                    metrics=result.get('metrics', {}),
                    cluster_analysis=result.get('cluster_analysis', {})
                )
        except httpx.TimeoutException as exc:
            logger.error("AetherMap: Timeout ao conectar com %s", self.base_url)
            raise Exception(
                "Timeout: AetherMap Space pode estar dormindo. Tente novamente em alguns segundos."
            ) from exc
        except httpx.ConnectError as exc:
            logger.error("AetherMap: Erro de conexão: %s", exc)
            raise Exception(f"Erro de conexão com AetherMap: {exc}") from exc
        except Exception as exc:
            logger.error("AetherMap: Erro inesperado: %s", exc)
            raise

        # Only non-200 responses reach this point. The error is raised outside
        # the try block so the catch-all handler above does not log it a
        # second time as an "unexpected" error.
        error_text = response.text[:500] if response.text else "No response body"
        logger.error("AetherMap error: %s - %s", response.status_code, error_text)
        raise Exception(f"AetherMap error: {response.status_code} - {error_text}")

    async def semantic_search(
        self,
        query: str,
        job_id: Optional[str] = None,
        turbo_mode: bool = False
    ) -> SearchResult:
        """
        Run a hybrid RAG semantic search over the processed documents.

        Args:
            query: Search term.
            job_id: Job id (defaults to the last processed job).
            turbo_mode: If True, faster (less accurate) search.

        Returns:
            SearchResult with the summary and the results.

        Raises:
            ValueError: when no job id is given and none is stored.
            Exception: when the service answers with a non-200 status.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info("AetherMap: Buscando '%s'...", query)

        result = await self._post_form('search', {
            'query': query,
            'job_id': job_id,
            'turbo_mode': 'true' if turbo_mode else 'false'
        })

        return SearchResult(
            summary=result.get('summary', ''),
            results=result.get('results', [])
        )

    async def extract_entities(self, job_id: Optional[str] = None) -> EntityGraphResult:
        """
        Extract named entities (NER) and build the connection graph.

        Args:
            job_id: Job id (defaults to the last processed job).

        Returns:
            EntityGraphResult with nodes, edges, hubs and insights. Missing
            keys in the response fall back to empty/zero defaults.

        Raises:
            ValueError: when no job id is given and none is stored.
            Exception: when the service answers with a non-200 status.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info("AetherMap: Extraindo entidades...")

        result = await self._post_form('entity_graph', {'job_id': job_id})

        # The response key is 'type'; the dataclass field is entity_type.
        nodes = [
            EntityNode(
                entity=n.get('entity', ''),
                entity_type=n.get('type', ''),
                docs=n.get('docs', 0),
                degree=n.get('degree', 0),
                centrality=n.get('centrality', 0.0),
                role=n.get('role', 'peripheral')
            )
            for n in result.get('nodes', [])
        ]

        edges = [
            EntityEdge(
                source_entity=e.get('source_entity', ''),
                target_entity=e.get('target_entity', ''),
                weight=e.get('weight', 0),
                reason=e.get('reason', '')
            )
            for e in result.get('edges', [])
        ]

        return EntityGraphResult(
            nodes=nodes,
            edges=edges,
            hubs=result.get('hubs', []),
            insights=result.get('insights', {})
        )

    async def analyze_graph(self, job_id: Optional[str] = None) -> GraphAnalysis:
        """
        Use an LLM to analyse the knowledge graph and extract insights.

        Args:
            job_id: Job id (defaults to the last processed job).

        Returns:
            GraphAnalysis with the textual analysis.

        Raises:
            ValueError: when no job id is given and none is stored.
            Exception: when the service answers with a non-200 status.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info("AetherMap: Analisando grafo com LLM...")

        result = await self._post_form('analyze_graph', {'job_id': job_id})

        return GraphAnalysis(
            analysis=result.get('analysis', ''),
            key_entities=result.get('key_entities', []),
            relationships=result.get('relationships', [])
        )

    async def describe_clusters(self, job_id: Optional[str] = None) -> Dict[str, Any]:
        """
        Use an LLM to describe each cluster that was found.

        Args:
            job_id: Job id (defaults to the last processed job).

        Returns:
            The decoded JSON response, returned as-is.

        Raises:
            ValueError: when no job id is given and none is stored.
            Exception: when the service answers with a non-200 status.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info("AetherMap: Descrevendo clusters...")

        return await self._post_form('describe_clusters', {'job_id': job_id})
| |
|
| |
|
| | |
# Shared client instance, created at import time with the default base URL
# and timeout.
# NOTE(review): it remembers the job id of the last process_documents call,
# so callers sharing this instance also share that default - pass job_id
# explicitly when that matters.
aethermap = AetherMapClient()
| |
|