File size: 15,198 Bytes
6ea41ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
"""Query ranker UI component and logic."""

import html
import logging

import gradio as gr

from src.core.logging_config import get_logger
from src.rag.vector_store import vector_store_manager
from src.rag import document_ingestion_service

logger = get_logger(__name__)


def handle_query_search(query, method, k_value):
    """Run a retrieval query and render the ranked chunks as an HTML fragment.

    Args:
        query: Free-text search string. ``None``, an empty string, or a
            whitespace-only string short-circuits to a placeholder panel
            without touching the vector store.
        method: Retrieval strategy: ``"similarity"``, ``"mmr"``, ``"bm25"``
            or ``"hybrid"``. Any other value is logged and yields an empty
            result list.
        k_value: Number of chunks to retrieve. Coerced with ``int()`` before
            use, so a float such as ``5.0`` is accepted.

    Returns:
        An HTML string: the placeholder panel, the rendered results (via
        ``_format_ranker_results_html``), or an error panel if anything
        raised while searching. Exceptions are never propagated.
    """
    if not query or not query.strip():
        return """
        <div class="ranker-container">
            <div class="ranker-placeholder">
                <h3>πŸ” Query Ranker</h3>
                <p>Enter a search query to find relevant document chunks with similarity scores.</p>
            </div>
        </div>
        """

    try:
        # The slider value may arrive as a float; the retriever arguments
        # and the ``fetch_k`` arithmetic are meant to be whole numbers.
        k = int(k_value)
        logger.info(f"Query search: '{query[:50]}...' using method: {method}")

        if method == "similarity":
            results = _similarity_results(query, k)
        else:
            retriever = _unscored_retriever(method, k)
            if retriever is None:
                logger.warning(f"Unknown retrieval method: {method!r}")
                results = []
            else:
                # These strategies are rendered without a score value.
                results = [
                    _format_ranker_result(doc, None, rank)
                    for rank, doc in enumerate(retriever.invoke(query), start=1)
                ]

        logger.info(f"Retrieved {len(results)} results for query using {method}")
        return _format_ranker_results_html(results, query, method)

    except Exception as e:
        error_msg = f"Error during search: {str(e)}"
        logger.error(error_msg)
        # Exception text can contain angle brackets or quotes (object reprs,
        # echoed user input); escape it so it is shown rather than parsed.
        return f"""
        <div class="ranker-container">
            <div class="ranker-error">
                <h3>❌ Search Error</h3>
                <p>{html.escape(error_msg)}</p>
                <p class="error-hint">Make sure documents are uploaded and the system is ready.</p>
            </div>
        </div>
        """


def _similarity_results(query, k):
    """Return formatted similarity-search results, with real scores when available."""
    try:
        vector_store = vector_store_manager.get_vector_store()
        if hasattr(vector_store, 'similarity_search_with_score'):
            docs_with_scores = vector_store.similarity_search_with_score(query, k=k)
            # Build the complete list before returning: if formatting fails
            # part-way, nothing partial leaks into the fallback below.
            # ``1 - score`` clamped at 0 assumes the store reports a
            # distance-like value (0 = closest) -- TODO confirm per backend.
            return [
                _format_ranker_result(
                    doc,
                    max(0, 1 - score) if score is not None else 0.8,
                    rank,
                )
                for rank, (doc, score) in enumerate(docs_with_scores, start=1)
            ]
    except Exception as e:
        logger.warning(f"Could not get similarity scores: {e}")

    # Fallback: the retriever is only queried when scored search is missing
    # or failed, so a successful scored search costs a single lookup.
    # The scores here are rank-derived placeholders, not measured values.
    retriever = vector_store_manager.get_retriever("similarity", {"k": k})
    docs = retriever.invoke(query)
    return [
        _format_ranker_result(doc, 0.85 - (i * 0.05), i + 1)
        for i, doc in enumerate(docs)
    ]


def _unscored_retriever(method, k):
    """Return the retriever for a score-less method, or ``None`` if the method is unknown."""
    if method == "mmr":
        return vector_store_manager.get_retriever(
            "mmr", {"k": k, "fetch_k": k * 2, "lambda_mult": 0.5}
        )
    if method == "bm25":
        return vector_store_manager.get_bm25_retriever(k=k)
    if method == "hybrid":
        return vector_store_manager.get_hybrid_retriever(
            k=k, semantic_weight=0.7, keyword_weight=0.3
        )
    return None


def _format_ranker_result(doc, score, rank):
    """Build the display record for one retrieved chunk.

    Args:
        doc: Object exposing ``metadata`` (a mapping) and ``page_content``
            (the chunk text).
        score: Numeric relevance score, or ``None`` when the retrieval
            method supplies none.
        rank: 1-based position of the chunk in the result list.

    Returns:
        A dict with the chunk text, its source/page/chunk id (with
        ``"Unknown"`` / ``"N/A"`` defaults), a length indicator, a
        rank-derived confidence label with colour and icon, and a
        ``has_score`` flag. The ``"score"`` key (rounded to 3 decimals) is
        present only when ``score`` is not ``None``.
    """
    meta = doc.metadata
    origin = meta.get("source", "Unknown")
    page_no = meta.get("page", "N/A")
    chunk_ref = meta.get("chunk_id", "Unknown")

    text = doc.page_content
    n_chars = len(text)

    # Size marker: three buckets split at 200 and 500 characters.
    if n_chars >= 500:
        size_icon = "πŸ“š"
    elif n_chars >= 200:
        size_icon = "πŸ“„"
    else:
        size_icon = "πŸ“"

    # Confidence is purely positional: ranks 1-2, ranks 3-4, everything after.
    if rank > 4:
        label, colour, icon = "Low", "#6c757d", "πŸ’‘"
    elif rank > 2:
        label, colour, icon = "Medium", "#ffc107", "⭐"
    else:
        label, colour, icon = "High", "#28a745", "πŸ”₯"

    scored = score is not None
    record = dict(
        rank=rank,
        content=text,
        source=origin,
        page=page_no,
        chunk_id=chunk_ref,
        length_indicator=f"{size_icon} {n_chars} chars",
        has_score=scored,
        confidence=label,
        confidence_color=colour,
        confidence_icon=icon,
    )

    # The score key exists only for methods that actually supplied one.
    if scored:
        record["score"] = round(score, 3)

    return record


def _format_ranker_results_html(results, query, method):
    """Render formatted search results as an HTML fragment.

    Args:
        results: Sequence of result dicts carrying the keys ``rank``,
            ``content``, ``source``, ``page``, ``length_indicator``,
            ``confidence``, ``confidence_color`` and ``confidence_icon``,
            plus ``score`` when ``has_score`` is true.
        query: The search text, echoed in the header.
        method: Retrieval method key; mapped to a display label when known,
            otherwise echoed back as-is (escaped).

    Returns:
        An HTML string: a "no results" panel when ``results`` is empty,
        otherwise a header followed by one card per result.

    All text that originates outside this function's own literals (query,
    chunk content, source, page, unknown method names) is HTML-escaped so
    that document text containing ``<``, ``&`` or quotes is displayed
    literally and cannot inject markup.
    """
    if not results:
        return """
        <div class="ranker-container">
            <div class="ranker-no-results">
                <h3>πŸ” No Results Found</h3>
                <p>No relevant documents found for your query.</p>
                <p class="no-results-hint">Try different keywords or check if documents are uploaded.</p>
            </div>
        </div>
        """

    # Method display names
    method_labels = {
        "similarity": "🎯 Similarity Search",
        "mmr": "πŸ”€ MMR (Diverse)",
        "bm25": "πŸ” BM25 (Keywords)",
        "hybrid": "πŸ”— Hybrid (Recommended)"
    }
    if method in method_labels:
        method_display = method_labels[method]
    else:
        # Unknown keys are echoed back, so treat them as untrusted text.
        method_display = html.escape(str(method))

    safe_query = html.escape(str(query))

    # Start building HTML
    html_parts = [f"""
    <div class="ranker-container">
        <div class="ranker-header">
            <div class="ranker-title">
                <h3>πŸ” Search Results</h3>
                <div class="query-display">"{safe_query}"</div>
            </div>
            <div class="ranker-meta">
                <span class="method-badge">{method_display}</span>
                <span class="result-count">{len(results)} results</span>
            </div>
        </div>
    """]

    medals = ["πŸ₯‡", "πŸ₯ˆ", "πŸ₯‰"]

    for result in results:
        rank = result["rank"]
        # Lower bound guards against a rank below 1 wrapping around the list.
        rank_emoji = medals[rank - 1] if 1 <= rank <= 3 else f"#{rank}"

        safe_content = html.escape(str(result["content"]))
        safe_source = html.escape(str(result["source"]))

        page = result["page"]
        if page != 'N/A':
            page_html = f"<span class='page-info'>Page {html.escape(str(page))}</span>"
        else:
            page_html = ""

        # Confidence is always shown; the numeric score only when present.
        score_info_parts = [f"""
                    <span class="confidence-badge" style="color: {result['confidence_color']}">
                        {result['confidence_icon']} {result['confidence']}
                    </span>"""]
        if result.get('has_score', False):
            score_info_parts.append(f'<span class="score-value">🎯 {result["score"]}</span>')

        score_info_html = f"""
                <div class="score-info">
                    {''.join(score_info_parts)}
                </div>"""

        html_parts.append(f"""
        <div class="result-card">
            <div class="result-header">
                <div class="rank-info">
                    <span class="rank-badge">{rank_emoji} Rank {rank}</span>
                    <span class="source-info">πŸ“„ {safe_source}</span>
                    {page_html}
                    <span class="length-info">{result['length_indicator']}</span>
                </div>
                {score_info_html}
            </div>
            <div class="result-content">
                <div class="content-text">{safe_content}</div>
            </div>
        </div>
        """)

    html_parts.append("</div>")

    return "".join(html_parts)


def get_ranker_status():
    """Render the query ranker's current status as an HTML card.

    Reads the collection info from ``vector_store_manager`` and the
    ingestion status from ``document_ingestion_service``, then reports the
    document count, the number of retrieval methods, and whether the system
    is flagged ready.

    Returns:
        An HTML string: the status card, or an error card if gathering the
        status raised. Exceptions are logged and never propagated.
    """
    try:
        # Get collection info
        collection_info = vector_store_manager.get_collection_info()
        document_count = collection_info.get("document_count", 0)

        # Get available methods
        available_methods = ["similarity", "mmr", "bm25", "hybrid"]

        # Check if system is ready
        ingestion_status = document_ingestion_service.get_ingestion_status()
        system_ready = ingestion_status.get('system_ready', False)

        indicator_class = 'status-ready' if system_ready else 'status-not-ready'
        indicator_text = '🟒 READY' if system_ready else 'πŸ”΄ NOT READY'
        store_state = 'Ready' if system_ready else 'Not Ready'
        # The count comes from an external service; escape it in case it is
        # not a plain number.
        safe_document_count = html.escape(str(document_count))

        status_html = f"""
        <div class="status-card">
            <div class="status-header">
                <h3>πŸ” Query Ranker Status</h3>
                <div class="status-indicator {indicator_class}">
                    {indicator_text}
                </div>
            </div>
            
            <div class="status-grid">
                <div class="status-item">
                    <div class="status-label">Available Documents</div>
                    <div class="status-value">{safe_document_count}</div>
                </div>
                <div class="status-item">
                    <div class="status-label">Retrieval Methods</div>
                    <div class="status-value">{len(available_methods)}</div>
                </div>
                <div class="status-item">
                    <div class="status-label">Vector Store</div>
                    <div class="status-value">{store_state}</div>
                </div>
            </div>
            
            <div class="ranker-methods">
                <div class="methods-label">Available Methods:</div>
                <div class="methods-list">
                    <span class="method-tag">🎯 Similarity</span>
                    <span class="method-tag">πŸ”€ MMR</span>
                    <span class="method-tag">πŸ” BM25</span>
                    <span class="method-tag">πŸ”— Hybrid</span>
                </div>
            </div>
        </div>
        """

        return status_html

    except Exception as e:
        error_msg = f"Error getting ranker status: {str(e)}"
        logger.error(error_msg)
        # Exception text may contain angle brackets (e.g. object reprs);
        # escape it so the message is displayed instead of parsed as markup.
        return f"""
        <div class="status-card status-error">
            <div class="status-header">
                <h3>❌ System Error</h3>
            </div>
            <p class="error-message">{html.escape(error_msg)}</p>
        </div>
        """


def create_query_ranker_tab():
    """Build the "Query Ranker" tab inside the enclosing Gradio layout.

    Lays out, top to bottom: a header, the status card, refresh/clear
    buttons, the query box with its search button, the method dropdown and
    result-count slider, and the results panel. Searching is triggered by
    submitting the query box, clicking the search button, or changing the
    method or the result count. Returns nothing.
    """
    with gr.TabItem("πŸ” Query Ranker"):
        with gr.Column(elem_classes=["ranker-container"]):
            # Header
            gr.HTML("""
            <div class="chat-header">
                <h2>πŸ” Query Ranker</h2>
                <p>Search and rank document chunks with transparency into retrieval methods</p>
            </div>
            """)

            # Status card, rendered once when the tab is built
            status_panel = gr.HTML(value=get_ranker_status())

            # Refresh / clear buttons
            with gr.Row(elem_classes=["control-buttons"]):
                refresh_button = gr.Button(
                    "πŸ”„ Refresh Status",
                    elem_classes=["control-btn", "btn-refresh"],
                )
                clear_button = gr.Button(
                    "πŸ—‘οΈ Clear Results",
                    elem_classes=["control-btn", "btn-clear-data"],
                )

            # Query box, search button, and retrieval options
            with gr.Column(elem_classes=["ranker-controls"]):
                with gr.Row(elem_classes=["ranker-input-row"]):
                    query_box = gr.Textbox(
                        placeholder="Enter your search query...",
                        show_label=False,
                        elem_classes=["ranker-query-input"],
                        scale=4,
                    )
                    search_button = gr.Button(
                        "πŸ” Search",
                        elem_classes=["ranker-search-btn"],
                        scale=0,
                    )

                with gr.Row(elem_classes=["ranker-options-row"]):
                    method_choice = gr.Dropdown(
                        choices=[
                            ("🎯 Similarity Search", "similarity"),
                            ("πŸ”€ MMR (Diverse)", "mmr"),
                            ("πŸ” BM25 (Keywords)", "bm25"),
                            ("πŸ”— Hybrid (Recommended)", "hybrid"),
                        ],
                        value="hybrid",
                        label="Retrieval Method",
                        scale=2,
                    )
                    result_count = gr.Slider(
                        minimum=1,
                        maximum=10,
                        value=5,
                        step=1,
                        label="Number of Results",
                        scale=1,
                    )

            # Results panel; an empty query yields the placeholder markup
            results_panel = gr.HTML(
                value=handle_query_search("", "hybrid", 5),
                elem_classes=["ranker-results-container"],
            )

            def search_wiring():
                """Return fresh listener arguments for one search trigger."""
                return {
                    "fn": handle_query_search,
                    "inputs": [query_box, method_choice, result_count],
                    "outputs": [results_panel],
                }

            def clear_ranker_results():
                """Clear the search results and reset to placeholder."""
                return handle_query_search("", "hybrid", 5), ""

            def refresh_ranker_status():
                """Refresh the ranker status display."""
                return get_ranker_status()

            # Listeners are registered in the same order as before:
            # submit, search click, refresh, clear, method change, count change.
            query_box.submit(**search_wiring())
            search_button.click(**search_wiring())

            refresh_button.click(
                fn=refresh_ranker_status,
                inputs=[],
                outputs=[status_panel],
            )
            clear_button.click(
                fn=clear_ranker_results,
                inputs=[],
                outputs=[results_panel, query_box],
            )

            # Re-run the search whenever the method or result count changes
            method_choice.change(**search_wiring())
            result_count.change(**search_wiring())