import streamlit as st
import numpy as np

from backend.pinecone_storage import initialize_pinecone


def retrieve_chunks(query_embedding, query_text=None, namespace="default", top_k=5):
    """Retrieve relevant chunks using vector similarity search.

    Queries the Pinecone index returned by ``initialize_pinecone()`` and
    converts each match that carries a ``'text'`` metadata field into a
    plain dict.

    Side effects (all on Streamlit):
        * Appends ``(filename, chunk_index)`` tuples for newly seen sources
          to ``st.session_state['sources_used']``.
        * Overwrites ``st.session_state['debug_info']['retrieval']`` with a
          summary of this call.
        * Emits a success / warning / error message in the UI.

    Args:
        query_embedding: Query embedding vector. Array-like objects exposing
            ``tolist()`` (e.g. NumPy arrays) are converted to plain lists
            before being sent to Pinecone.
        query_text: Optional query text. It is only recorded in the debug
            info; it does not influence the search.
        namespace: Pinecone namespace to search.
        top_k: Number of results to request.

    Returns:
        A list of dicts with keys ``'text'``, ``'metadata'`` and ``'score'``.
        An empty list is returned when the index cannot be initialized,
        nothing usable matches, or any exception is raised.
    """
    # Top-level UI boundary: every failure is reported to the user and turned
    # into an empty result instead of propagating.
    try:
        index = initialize_pinecone()
        if not index:
            st.error("Failed to initialize Pinecone for retrieval")
            return []

        # The Pinecone client documents `vector` as a list of floats, so
        # normalise array-likes; plain lists have no `tolist` and pass through.
        if hasattr(query_embedding, "tolist"):
            query_embedding = query_embedding.tolist()

        search_results = index.query(
            namespace=namespace,
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
        )

        chunks = []
        for match in search_results.matches:
            metadata = getattr(match, "metadata", None)
            # Skip matches without metadata or without the text payload.
            if not metadata or 'text' not in metadata:
                continue

            # Track which sources were used so they can be attributed later.
            if 'sources_used' not in st.session_state:
                st.session_state['sources_used'] = []

            if 'filename' in metadata and 'chunk_index' in metadata:
                source_info = (metadata['filename'], metadata['chunk_index'])
                if source_info not in st.session_state['sources_used']:
                    st.session_state['sources_used'].append(source_info)

            chunks.append(
                {
                    'text': metadata['text'],
                    'metadata': metadata,
                    'score': match.score,
                }
            )

        # Surface the retrieval outcome in the UI for debugging.
        if chunks:
            st.success(f"Retrieved {len(chunks)} relevant chunks from document.")
        else:
            st.warning("No relevant chunks found in the document.")

        # Store debug info about this retrieval call.
        if 'debug_info' not in st.session_state:
            st.session_state['debug_info'] = {}

        st.session_state['debug_info']['retrieval'] = {
            'query_text': query_text,
            'top_k': top_k,
            'num_results': len(chunks),
            'namespace': namespace,
        }

        return chunks

    except Exception as e:
        st.error(f"Error retrieving chunks: {e}")
        return []