Spaces:

sivan22
/

Ituria

Running

File size: 6,897 Bytes

f6c0c6c

from typing import List, Dict, Any, Optional
from tantivy import Index
import logging
import os
import re


class TantivySearchAgent:
    """Thin wrapper around a Tantivy index that runs queries and builds snippets.

    The agent opens an existing index, exposes the query-syntax instructions
    that are handed to an LLM, and turns search hits into plain dictionaries
    with highlighted context around the matched query terms.
    """

    # Characters of context kept on each side of a highlighted match.
    _CONTEXT_CHARS = 50
    # Length of the fallback snippet used when no query term occurs in the text.
    _FALLBACK_CHARS = 100

    # ASCII field prefixes such as ``text:`` or ``reference:``.
    _FIELD_PREFIX_RE = re.compile(r'\b[A-Za-z_][A-Za-z0-9_]*:')
    # Numeric boost / slop modifiers such as ``^2.0`` or ``~3``.
    _MODIFIER_RE = re.compile(r'[\^~]\d+(?:\.\d+)?')
    # Remaining query syntax: punctuation, boolean keywords and +/- markers.
    _SYNTAX_RE = re.compile(r'[:"()[\]{}^~*\\]|\b(AND|OR|NOT|TO|IN)\b|[-+]')

    def __init__(self, index_path: str):
        """Open the Tantivy index located at ``index_path``.

        Args:
            index_path: Path passed to ``Index.open``.

        Raises:
            Exception: Whatever ``Index.open`` raises; the failure is logged
                and re-raised unchanged.
        """
        self.index_path = index_path
        self.logger = logging.getLogger(__name__)
        try:
            self.index = Index.open(index_path)
            self.logger.info(f"Successfully opened Tantivy index at {index_path}")
        except Exception as e:
            self.logger.error(f"Failed to open Tantivy index: {e}")
            raise

    def get_query_instructions(self) -> str:
        """Return instructions for the LLM on how to parse and construct Tantivy queries"""
        return """

Instructions for generating a query:



1. Boolean Operators:



   - AND: term1 AND term2 (both required)

   - OR: term1 OR term2 (either term)

   - Multiple words default to OR operation (cloud network = cloud OR network)

   - AND takes precedence over OR

   - Example: Shabath AND (walk OR go)



2. Field-specific Terms:

   - Field-specific terms: field:term

   - Example: text:אדם AND reference:בראשית

   - available fields: text, reference, topics

   - text contains the text of the document

   - reference contains the citation of the document, e.g. בראשית, פרק א

   - topics contains the topics of the document. available topics includes: תנך, הלכה, מדרש, etc.



3. Required/Excluded Terms:

   - Required (+): +term (must contain)

   - Excluded (-): -term (must not contain)

   - Example: +security cloud -deprecated

   - Equivalent to: security AND cloud AND NOT deprecated



4. Phrase Search:

   - Use quotes: "exact phrase"

   - Both single/double quotes work

   - Escape quotes with \\"

   - Slop operator: "term1 term2"~N 

   - Example: "cloud security"~2 

   - the above will find "cloud framework and security "

   - Prefix matching: "start of phrase"*



5. Wildcards:

   - ? for single character

   - * for any number of characters

   - Example: sec?rity cloud*



6. Special Features:

   - All docs: * 

   - Boost terms: term^2.0 (positive numbers only)

   - Example: security^2.0 cloud

   - the above will boost security by 2.0

   

Query Examples:

1. Basic: +שבת +חולה +אסור

2. Field-specific: text:סיני AND topics:תנך

3. Phrase with slop: "security framework"~2

4. Complex: +reference:בראשית +text:"הבל"^2.0 +(דמי OR דמים) -הבלים

6. Mixed: ((text:"רבנו משה"^2.0 OR reference:"משנה תורה") AND topics:הלכה) AND text:"תורה המלך"~3 AND NOT topics:מדרש



Tips:

- Group complex expressions with parentheses

- Use quotes for exact phrases

- Add + for required terms, - for excluded terms

- Boost important terms with ^N

- use field-specific terms for better results. 

"""

    @classmethod
    def _extract_highlight_terms(cls, query: str) -> List[str]:
        """Return the plain search terms of ``query`` with all query syntax removed."""
        # Drop field names and numeric modifiers first so that tokens such as
        # "text" or "2.0" are not mistaken for words to highlight.
        cleaned = cls._FIELD_PREFIX_RE.sub(' ', query)
        cleaned = cls._MODIFIER_RE.sub(' ', cleaned)
        cleaned = cls._SYNTAX_RE.sub(' ', cleaned)
        # Single-character leftovers are too noisy to be useful highlights.
        return [term for term in cleaned.split() if len(term) > 1]

    @classmethod
    def _build_highlights(cls, text: str, pattern: Optional["re.Pattern"]) -> List[str]:
        """Return context snippets around each match of ``pattern`` in ``text``.

        Falls back to the beginning of ``text`` when there is no pattern or
        the pattern does not match.
        """
        if pattern is not None:
            snippets = []
            for match in pattern.finditer(text):
                start = max(0, match.start() - cls._CONTEXT_CHARS)
                end = min(len(text), match.end() + cls._CONTEXT_CHARS)
                snippet = text[start:end]
                # Ellipses mark where the snippet was cut out of a longer text.
                if start > 0:
                    snippet = f"...{snippet}"
                if end < len(text):
                    snippet = f"{snippet}..."
                snippets.append(snippet)
            if snippets:
                return snippets

        if len(text) > cls._FALLBACK_CHARS:
            return [text[:cls._FALLBACK_CHARS] + "..."]
        return [text]

    def search(self, query: str, num_results: int = 10) -> List[Dict[str, Any]]:
        """Search the Tantivy index with the given query using Tantivy's query syntax.

        Args:
            query: Query string in Tantivy query syntax; parsed leniently.
            num_results: Maximum number of hits to return.

        Returns:
            One dictionary per hit with the keys ``score``, ``title``,
            ``reference``, ``topics``, ``file_path``, ``line_number``,
            ``is_pdf``, ``text`` and ``highlights``. An empty list is returned
            when parsing or searching fails; the failure is logged.
        """
        try:
            searcher = self.index.searcher()

            try:
                parse_result = self.index.parse_query_lenient(query)
                parsed_query = parse_result[0]
                parse_errors = parse_result[1] if len(parse_result) > 1 else []
                search_results = searcher.search(parsed_query, num_results).hits
            except Exception as query_error:
                self.logger.error(f"Lenient query parsing failed: {query_error}")
                # Nothing to process; returning here avoids touching an
                # unbound ``search_results`` below.
                return []

            if parse_errors:
                self.logger.warning(f"Query parsed leniently with errors: {parse_errors}")

            # The highlight pattern depends only on the query, so build it
            # once instead of once per hit.
            highlight_terms = self._extract_highlight_terms(query)
            highlight_pattern = None
            if highlight_terms:
                highlight_pattern = re.compile(
                    '|'.join(re.escape(term) for term in highlight_terms),
                    re.IGNORECASE,
                )

            results = []
            for score, doc_address in search_results:
                doc = searcher.doc(doc_address)
                text = doc.get_first("text")
                file_path = doc.get_first("filePath")
                results.append({
                    "score": float(score),
                    "title": doc.get_first("title") or os.path.basename(file_path or ""),
                    "reference": doc.get_first("reference"),
                    "topics": doc.get_first("topics"),
                    "file_path": file_path,
                    "line_number": doc.get_first("segment"),
                    "is_pdf": doc.get_first("isPdf"),
                    "text": text,
                    # A hit without stored text must not abort the whole search.
                    "highlights": self._build_highlights(text or "", highlight_pattern),
                })

            self.logger.info(f"Found {len(results)} results for query: {query}")
            return results

        except Exception as e:
            self.logger.error(f"Error during search: {str(e)}")
            return []

    def validate_index(self) -> bool:
        """Check that the index can be searched.

        Returns:
            True when a searcher can be created and the match-all query ``*``
            can be parsed and executed; False otherwise (the error is logged).
        """
        try:
            searcher = self.index.searcher()
            query_parser = self.index.parse_query("*")
            searcher.search(query_parser, 1)
            return True
        except Exception as e:
            self.logger.error(f"Index validation failed: {e}")
            return False