File size: 6,897 Bytes
f6c0c6c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
from typing import List, Dict, Any, Optional
from tantivy import Index
import logging
import os
import re
class TantivySearchAgent:
    """Search an existing Tantivy index and return hits with context snippets.

    The index is opened once in ``__init__``.  The public surface is:

    * ``get_query_instructions`` - prompt text describing the query syntax.
    * ``search`` - run a query and return one dict per hit.
    * ``validate_index`` - check that the index can be searched.
    """

    # Characters of surrounding context kept on each side of a match.
    _CONTEXT_CHARS = 50
    # Length of the fallback snippet used when nothing in the text matches.
    _FALLBACK_CHARS = 100

    # Boost (``^2.0``) and slop (``~3``) suffixes.  Removed first so their
    # numbers are not mistaken for search terms.
    _BOOST_OR_SLOP_RE = re.compile(r'[\^~]\d+(?:\.\d+)?')
    # ``field:`` prefixes - the field name is query syntax, not document text.
    _FIELD_PREFIX_RE = re.compile(r'\b\w+:')
    # Remaining syntax characters and the upper-case query keywords.
    _SYNTAX_RE = re.compile(r'[:"()[\]{}^~*\\]|\b(AND|OR|NOT|TO|IN)\b|[-+]')

    def __init__(self, index_path: str):
        """Open the Tantivy index stored at ``index_path``.

        Args:
            index_path: Path handed to ``Index.open``.

        Raises:
            Exception: Whatever ``Index.open`` raises; it is logged and
                re-raised unchanged.
        """
        self.index_path = index_path
        self.logger = logging.getLogger(__name__)
        try:
            self.index = Index.open(index_path)
            self.logger.info("Successfully opened Tantivy index at %s", index_path)
        except Exception as e:
            self.logger.error("Failed to open Tantivy index: %s", e)
            raise

    def get_query_instructions(self) -> str:
        """Return the query-syntax instructions that are handed to the LLM.

        The text is returned verbatim (it starts with a newline) and is meant
        to be embedded in a prompt, so its content is part of the program's
        behaviour rather than documentation.
        """
        return """
Instructions for generating a query:
1. Boolean Operators:
- AND: term1 AND term2 (both required)
- OR: term1 OR term2 (either term)
- Multiple words default to OR operation (cloud network = cloud OR network)
- AND takes precedence over OR
- Example: Shabath AND (walk OR go)
2. Field-specific Terms:
- Field-specific terms: field:term
- Example: text:אדם AND reference:בראשית
- available fields: text, reference, topics
- text contains the text of the document
- reference contains the citation of the document, e.g. בראשית, פרק א
- topics contains the topics of the document. available topics includes: תנך, הלכה, מדרש, etc.
3. Required/Excluded Terms:
- Required (+): +term (must contain)
- Excluded (-): -term (must not contain)
- Example: +security cloud -deprecated
- Equivalent to: security AND cloud AND NOT deprecated
4. Phrase Search:
- Use quotes: "exact phrase"
- Both single/double quotes work
- Escape quotes with \\"
- Slop operator: "term1 term2"~N
- Example: "cloud security"~2
- the above will find "cloud framework and security "
- Prefix matching: "start of phrase"*
5. Wildcards:
- ? for single character
- * for any number of characters
- Example: sec?rity cloud*
6. Special Features:
- All docs: *
- Boost terms: term^2.0 (positive numbers only)
- Example: security^2.0 cloud
- the above will boost security by 2.0
Query Examples:
1. Basic: +שבת +חולה +אסור
2. Field-specific: text:סיני AND topics:תנך
3. Phrase with slop: "security framework"~2
4. Complex: +reference:בראשית +text:"הבל"^2.0 +(דמי OR דמים) -הבלים
5. Mixed: ((text:"רבנו משה"^2.0 OR reference:"משנה תורה") AND topics:הלכה) AND text:"תורה המלך"~3 AND NOT topics:מדרש
Tips:
- Group complex expressions with parentheses
- Use quotes for exact phrases
- Add + for required terms, - for excluded terms
- Boost important terms with ^N
- use field-specific terms for better results.
"""

    @classmethod
    def _highlight_pattern(cls, query: str) -> "Optional[re.Pattern]":
        """Build a regex matching the plain search terms of ``query``.

        Query syntax (boosts, slop, field prefixes, operators, punctuation) is
        stripped and terms of a single character are dropped.  Returns ``None``
        when no usable term is left.
        """
        stripped = cls._BOOST_OR_SLOP_RE.sub(' ', query)
        stripped = cls._FIELD_PREFIX_RE.sub(' ', stripped)
        stripped = cls._SYNTAX_RE.sub(' ', stripped)
        terms = [term for term in stripped.split() if len(term) > 1]
        if not terms:
            return None
        # re.escape keeps every term literal, including the `?` wildcard.
        return re.compile('|'.join(re.escape(term) for term in terms), re.IGNORECASE)

    @classmethod
    def _build_highlights(cls, text: str, pattern: "Optional[re.Pattern]") -> List[str]:
        """Return one context snippet per match, or a leading excerpt of ``text``.

        Each snippet spans ``_CONTEXT_CHARS`` characters on both sides of the
        match and is marked with ``...`` wherever it was cut.
        """
        if pattern is not None:
            snippets = []
            for match in pattern.finditer(text):
                start = max(0, match.start() - cls._CONTEXT_CHARS)
                end = min(len(text), match.end() + cls._CONTEXT_CHARS)
                snippet = text[start:end]
                if start > 0:
                    snippet = f"...{snippet}"
                if end < len(text):
                    snippet = f"{snippet}..."
                snippets.append(snippet)
            if snippets:
                return snippets
        # No usable terms, or none of them occurs in the text.
        if len(text) > cls._FALLBACK_CHARS:
            return [text[:cls._FALLBACK_CHARS] + "..."]
        return [text]

    def search(self, query: str, num_results: int = 10) -> List[Dict[str, Any]]:
        """Search the index using Tantivy's query syntax.

        Args:
            query: Query string; it is parsed leniently.
            num_results: Maximum number of hits to return.

        Returns:
            One dict per hit with the keys ``score``, ``title``, ``reference``,
            ``topics``, ``file_path``, ``line_number``, ``is_pdf``, ``text`` and
            ``highlights``.  An empty list is returned when ``num_results`` is
            not positive or when anything fails; failures are logged, never
            raised.
        """
        # A non-positive limit can never yield hits, so do not send it to the
        # index.  NOTE(review): the underlying collector may reject a zero
        # limit outright - confirm against the tantivy version in use.
        if num_results <= 0:
            return []

        try:
            searcher = self.index.searcher()

            try:
                parsed = self.index.parse_query_lenient(query)
                # Lenient parsing reports problems instead of raising; surface
                # them so a silently truncated query can be diagnosed.
                if len(parsed) > 1 and parsed[1]:
                    self.logger.warning("Query parsed with recoverable errors: %s", parsed[1])
                hits = searcher.search(parsed[0], num_results).hits
            except Exception as query_error:
                self.logger.error("Lenient query parsing failed: %s", query_error)
                # Without hits there is nothing to process.
                return []

            # The highlight regex depends only on the query: build it once.
            pattern = self._highlight_pattern(query)

            results = []
            for score, doc_address in hits:
                doc = searcher.doc(doc_address)
                # A document without stored text must not abort the whole search.
                text = doc.get_first("text") or ""
                file_path = doc.get_first("filePath")
                results.append({
                    "score": float(score),
                    "title": doc.get_first("title") or os.path.basename(file_path or ""),
                    "reference": doc.get_first("reference"),
                    "topics": doc.get_first("topics"),
                    "file_path": file_path,
                    "line_number": doc.get_first("segment"),
                    "is_pdf": doc.get_first("isPdf"),
                    "text": text,
                    "highlights": self._build_highlights(text, pattern),
                })

            self.logger.info("Found %d results for query: %s", len(results), query)
            return results
        except Exception as e:
            self.logger.error("Error during search: %s", e)
            return []

    def validate_index(self) -> bool:
        """Return True when a match-all query can be run against the index.

        Any exception is logged and reported as ``False``.
        """
        try:
            searcher = self.index.searcher()
            match_all = self.index.parse_query("*")
            searcher.search(match_all, 1)
            return True
        except Exception as e:
            self.logger.error("Index validation failed: %s", e)
            return False
|