# src/semantic_retriever.py

from typing import List, Dict, Any, Optional

from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document

from src.ontology_manager import OntologyManager


class SemanticRetriever:
    """
    Enhanced retrieval system that combines vector search with ontology awareness.
    """

    def __init__(
        self,
        ontology_manager: OntologyManager,
        embeddings_model=None,
        text_chunks: Optional[List[str]] = None
    ):
        """
        Initialize the semantic retriever.

        Args:
            ontology_manager: The ontology manager instance
            embeddings_model: The embeddings model to use (defaults to OpenAIEmbeddings)
            text_chunks: Optional list of text chunks to add to the vector store
        """
        self.ontology_manager = ontology_manager
        self.embeddings = embeddings_model or OpenAIEmbeddings()

        # Create a vector store from the text representation of the ontology
        ontology_text = ontology_manager.get_text_representation()
        self.ontology_chunks = self._split_text(ontology_text)

        # Add additional text chunks if provided
        if text_chunks:
            self.text_chunks = text_chunks
            all_chunks = self.ontology_chunks + text_chunks
        else:
            self.text_chunks = []
            all_chunks = self.ontology_chunks

        # Convert to Document objects for FAISS, tagging each chunk with its origin
        documents = [
            Document(
                page_content=chunk,
                metadata={"source": "ontology" if i < len(self.ontology_chunks) else "text"}
            )
            for i, chunk in enumerate(all_chunks)
        ]

        # Create the vector store
        self.vector_store = FAISS.from_documents(documents, self.embeddings)

    def _split_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
        """Split text into overlapping chunks for embedding."""
        chunks = []
        text_length = len(text)

        # Step by (chunk_size - overlap) so consecutive chunks share `overlap` characters
        for i in range(0, text_length, chunk_size - overlap):
            chunk = text[i:i + chunk_size]
            if len(chunk) < 50:  # Skip very small trailing chunks
                continue
            chunks.append(chunk)

        return chunks

    def retrieve(self, query: str, k: int = 4, include_ontology_context: bool = True) -> List[Document]:
        """
        Retrieve relevant documents using a hybrid approach.

        Args:
            query: The query string
            k: Number of documents to retrieve
            include_ontology_context: Whether to include additional ontology context

        Returns:
            A list of retrieved documents
        """
        # Get semantic context from the ontology
        if include_ontology_context:
            ontology_context = self.ontology_manager.get_semantic_context(query)
        else:
            ontology_context = []

        # Perform vector similarity search
        vector_results = self.vector_store.similarity_search(query, k=k)

        # Copy the list so appending ontology context does not mutate the raw search results
        combined_results = list(vector_results)

        # Add ontology context as additional documents
        for i, context in enumerate(ontology_context):
            combined_results.append(Document(
                page_content=context,
                metadata={"source": "ontology_context", "context_id": i}
            ))

        return combined_results
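    # A minimal usage sketch for `retrieve` (illustrative only; assumes a
    # populated OntologyManager and a valid OPENAI_API_KEY in the environment;
    # the query string and print format are hypothetical):
    #
    #     retriever = SemanticRetriever(manager)
    #     docs = retriever.retrieve("Which department owns the Analytics product?", k=4)
    #     for doc in docs:
    #         print(doc.metadata["source"], doc.page_content[:80])
    #
    # Documents tagged "ontology_context" come from the ontology manager rather
    # than the FAISS index, so the caller can weight the two sources differently.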
    def retrieve_with_paths(self, query: str, k: int = 4) -> Dict[str, Any]:
        """
        Enhanced retrieval that includes semantic paths between entities.

        Args:
            query: The query string
            k: Number of documents to retrieve

        Returns:
            A dictionary containing retrieved documents and semantic paths
        """
        # Basic retrieval
        basic_results = self.retrieve(query, k)

        # Extract potential entities from the query (simplified keyword approach).
        # A more sophisticated approach would use NER or entity linking.
        entity_types = ["Product", "Department", "Employee", "Manager", "Customer", "Feedback"]
        query_words = query.lower().split()

        potential_entities = []
        for entity_type in entity_types:
            if entity_type.lower() in query_words:
                # Get instances of this type
                instances = self.ontology_manager.get_instances_of_class(entity_type)
                if instances:
                    # Just take the first few for demonstration
                    potential_entities.extend(instances[:2])

        # Find paths between each pair of potential entities
        paths = []
        if len(potential_entities) >= 2:
            for i in range(len(potential_entities)):
                for j in range(i + 1, len(potential_entities)):
                    source = potential_entities[i]
                    target = potential_entities[j]

                    # Find paths between these entities
                    entity_paths = self.ontology_manager.find_paths(source, target, max_length=3)

                    for path in entity_paths:
                        # Convert the path to a human-readable description
                        path_text = self._path_to_text(path)
                        paths.append({
                            "source": source,
                            "target": target,
                            "path": path,
                            "text": path_text
                        })

        # Convert paths to documents
        path_documents = []
        for i, path_info in enumerate(paths):
            path_documents.append(Document(
                page_content=path_info["text"],
                metadata={
                    "source": "semantic_path",
                    "path_id": i,
                    "source_entity": path_info["source"],
                    "target_entity": path_info["target"]
                }
            ))

        return {
            "documents": basic_results + path_documents,
            "paths": paths
        }

    def _path_to_text(self, path: List[Dict]) -> str:
        """Convert a path (a list of edge dicts) to a text description."""
        if not path:
            return ""

        text_parts = []
        for edge in path:
            source = edge["source"]
            target = edge["target"]
            relation = edge["type"]

            # Get entity information
            source_info = self.ontology_manager.get_entity_info(source)
            target_info = self.ontology_manager.get_entity_info(target)

            # Prefer human-readable names when available
            source_name = source
            if "properties" in source_info and "name" in source_info["properties"]:
                source_name = source_info["properties"]["name"]

            target_name = target
            if "properties" in target_info and "name" in target_info["properties"]:
                target_name = target_info["properties"]["name"]

            # Describe the relationship
            text_parts.append(f"{source_name} {relation} {target_name}")

        return " -> ".join(text_parts)
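    # Edge shape consumed by `_path_to_text`, as implied by the key accesses
    # above (the identifiers and relation names below are hypothetical):
    #
    #     path = [
    #         {"source": "emp42", "type": "worksIn", "target": "dept7"},
    #         {"source": "dept7", "type": "owns", "target": "prod3"},
    #     ]
    #
    # With "name" properties available on each entity, this renders roughly as:
    #     "Alice worksIn Engineering -> Engineering owns Analytics"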
    def search_by_property(self, class_type: str, property_name: str, property_value: str) -> List[Document]:
        """
        Search for instances of a class with a specific property value.

        Args:
            class_type: The class to search in
            property_name: The property name to match
            property_value: The property value to match

        Returns:
            A list of matched entities as documents
        """
        instances = self.ontology_manager.get_instances_of_class(class_type)

        results = []
        for instance_id in instances:
            entity_info = self.ontology_manager.get_entity_info(instance_id)
            properties = entity_info.get("properties", {})

            if property_name in properties:
                # Simple case-insensitive string matching (could be enhanced with fuzzy matching)
                if str(properties[property_name]).lower() == property_value.lower():
                    # Convert the matched instance to a document
                    doc_content = f"Instance: {instance_id}\n"
                    doc_content += f"Type: {class_type}\n"
                    doc_content += "Properties:\n"
                    for prop_name, prop_value in properties.items():
                        doc_content += f"- {prop_name}: {prop_value}\n"

                    results.append(Document(
                        page_content=doc_content,
                        metadata={
                            "source": "property_search",
                            "instance_id": instance_id,
                            "class_type": class_type
                        }
                    ))

        return results
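
# A minimal end-to-end sketch. Assumptions: OPENAI_API_KEY is set, and the
# OntologyManager constructor argument shown here is hypothetical -- substitute
# however your ontology is actually loaded. Query strings and property values
# are illustrative.
if __name__ == "__main__":
    manager = OntologyManager("data/enterprise_ontology.json")  # hypothetical path/signature
    retriever = SemanticRetriever(manager)

    # Hybrid retrieval plus semantic paths between entities mentioned in the query
    result = retriever.retrieve_with_paths("Which manager handles customer feedback?", k=4)
    for doc in result["documents"]:
        print(doc.metadata.get("source"), "->", doc.page_content[:80])

    # Exact-match property search (case-insensitive)
    matches = retriever.search_by_property("Employee", "name", "alice")
    print(f"{len(matches)} matching instance(s)")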