Spaces:

Qar-Raz
/

NLP-RAG

Sleeping

App Files Files Community

ramailkk committed 21 days ago

Commit

643df21

1 Parent(s): 7513ffe

working first phase

Browse files

Files changed (2) hide show

retriever/processor.py +158 -29
retriever/retriever.py +95 -8

retriever/processor.py CHANGED Viewed

@@ -6,92 +6,159 @@ from langchain_text_splitters import (
 from langchain_experimental.text_splitter import SemanticChunker
 from langchain_huggingface import HuggingFaceEmbeddings
 from sentence_transformers import SentenceTransformer
 class ChunkProcessor:
-    def __init__(self, model_name='all-MiniLM-L6-v2'):
         self.model_name = model_name
         self.encoder = SentenceTransformer(model_name)
         # Required for Semantic Chunking
         self.hf_embeddings = HuggingFaceEmbeddings(model_name=model_name)
-    def get_splitter(self, technique, chunk_size=500, chunk_overlap=50, **kwargs):
         """
         Factory method to return different chunking strategies.
         """
         if technique == "fixed":
             return CharacterTextSplitter(
                 separator=kwargs.get('separator', ""),
                 chunk_size=chunk_size,
-                chunk_overlap=chunk_overlap
             )
         elif technique == "recursive":
             return RecursiveCharacterTextSplitter(
                 chunk_size=chunk_size,
-                chunk_overlap=chunk_overlap
             )
         elif technique == "character":
             return CharacterTextSplitter(
                 separator=kwargs.get('separator', "\n\n"),
                 chunk_size=chunk_size,
-                chunk_overlap=chunk_overlap
             )
         elif technique == "sentence":
-            # Using Recursive Splitter configured specifically for sentence boundaries
-            # This avoids the Spacy [E050] error while still respecting full sentences.
             return RecursiveCharacterTextSplitter(
                 chunk_size=chunk_size,
                 chunk_overlap=chunk_overlap,
-                separators=["\n\n", "\n", ". ", "? ", "! ", " ", ""]
             )
         elif technique == "semantic":
             return SemanticChunker(
                 self.hf_embeddings,
-                breakpoint_threshold_type="percentile"
             )
         elif technique == "token":
             return SentenceTransformersTokenTextSplitter(
                 model_name=self.model_name,
                 tokens_per_chunk=chunk_size,
-                chunk_overlap=chunk_overlap
             )
         else:
-            raise ValueError(f"Technique '{technique}' is not supported.")
-    def process(self, df, technique="recursive", chunk_size=500, chunk_overlap=50, **kwargs):
         """
-        Processes a DataFrame into vector-ready chunks with full output for 5 documents.
         """
         splitter = self.get_splitter(technique, chunk_size, chunk_overlap, **kwargs)
         processed_chunks = []
-        # Take the first 5 documents as requested
-        subset_df = df.head(5)
         for _, row in subset_df.iterrows():
-            print(f"\n" + "="*80)
-            print(f"📄 DOCUMENT: {row['title']}")
-            print(f"🔗 URL: {row['url']}")
-            print("-" * 80)
             # Split the text
             raw_chunks = splitter.split_text(row['full_text'])
-            print(f"🎯 Technique: {technique.upper()} | Total Chunks: {len(raw_chunks)}")
             for i, text in enumerate(raw_chunks):
-                # Standardize output
                 content = text.page_content if hasattr(text, 'page_content') else text
-                # Print the full content of every chunk
-                print(f"\n[Chunk {i}] ({len(content)} chars):")
-                print(f"   {content}")
-                # Embedding
                 embedding = self.encoder.encode(content).tolist()
                 processed_chunks.append({
@@ -102,10 +169,72 @@ class ChunkProcessor:
                         "text": content,
                         "url": row['url'],
                         "chunk_index": i,
-                        "technique": technique
                     }
                 })
-            print("="*80)
-        print(f"\n✅ Finished processing 5 documents into {len(processed_chunks)} chunks.")
-        return processed_chunks

 from langchain_experimental.text_splitter import SemanticChunker
 from langchain_huggingface import HuggingFaceEmbeddings
 from sentence_transformers import SentenceTransformer
+from typing import List, Dict, Any, Optional
+import pandas as pd
 class ChunkProcessor:
+    def __init__(self, model_name='all-MiniLM-L6-v2', verbose: bool = True):
         self.model_name = model_name
         self.encoder = SentenceTransformer(model_name)
+        self.verbose = verbose
         # Required for Semantic Chunking
         self.hf_embeddings = HuggingFaceEmbeddings(model_name=model_name)
+    def _print(self, *args, **kwargs):
+        """Helper method to conditionally print"""
+        if self.verbose:
+            print(*args, **kwargs)
+    def get_splitter(self, technique: str, chunk_size: int = 500, chunk_overlap: int = 50, **kwargs):
         """
         Factory method to return different chunking strategies.
+        Strategies:
+        - "fixed": Simple character-based splitting with empty separator (can split mid-sentence)
+        - "recursive": Recursive character splitting with hierarchical separators (preserves semantics)
+        - "character": Character-based splitting with paragraph separator
+        - "sentence": Recursive splitting optimized for sentence boundaries
+        - "semantic": Embedding-based semantic chunking
+        - "token": Token-based splitting for transformer models
         """
         if technique == "fixed":
+            # FIXED: Simple character-based splitter - WILL split mid-sentence
             return CharacterTextSplitter(
                 separator=kwargs.get('separator', ""),
                 chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
+                length_function=len,
+                is_separator_regex=False
             )
         elif technique == "recursive":
+            # FIXED: Proper recursive splitter with default separators that preserve semantics
+            separators = kwargs.get('separators', ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""])
             return RecursiveCharacterTextSplitter(
                 chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
+                separators=separators,
+                length_function=len,
+                keep_separator=kwargs.get('keep_separator', True)
             )
         elif technique == "character":
+            # FIXED: Character splitter with paragraph separator
             return CharacterTextSplitter(
                 separator=kwargs.get('separator', "\n\n"),
                 chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
+                length_function=len,
+                is_separator_regex=False
             )
         elif technique == "sentence":
+            # FIXED: Using Recursive Splitter with comprehensive sentence boundaries
+            # This preserves full sentences whenever possible
             return RecursiveCharacterTextSplitter(
                 chunk_size=chunk_size,
                 chunk_overlap=chunk_overlap,
+                separators=kwargs.get('separators', ["\n\n", "\n", ". ", "? ", "! ", ".\n", "?\n", "!\n", "; ", ": ", ", ", " ", ""]),
+                length_function=len,
+                keep_separator=kwargs.get('keep_separator', True)
             )
         elif technique == "semantic":
+            # FIXED: Semantic chunker with proper configuration
             return SemanticChunker(
                 self.hf_embeddings,
+                breakpoint_threshold_type=kwargs.get('breakpoint_threshold_type', "percentile"),
+                breakpoint_threshold_amount=kwargs.get('breakpoint_threshold_amount', 95),
+                min_chunk_size=kwargs.get('min_chunk_size', chunk_size // 10),
+                max_chunk_size=kwargs.get('max_chunk_size', chunk_size)
             )
         elif technique == "token":
+            # FIXED: Token-based splitter with proper token counting
             return SentenceTransformersTokenTextSplitter(
                 model_name=self.model_name,
                 tokens_per_chunk=chunk_size,
+                chunk_overlap=chunk_overlap,
+                length_function=kwargs.get('length_function', lambda x: len(self.encoder.encode(x)))
             )
         else:
+            raise ValueError(f"Technique '{technique}' is not supported. Choose from: fixed, recursive, character, sentence, semantic, token")
+    def process(self, df: pd.DataFrame, technique: str = "recursive", chunk_size: int = 500,
+                chunk_overlap: int = 50, max_docs: Optional[int] = 5, verbose: Optional[bool] = None,
+                **kwargs) -> List[Dict[str, Any]]:
         """
+        Processes a DataFrame into vector-ready chunks with full output for documents.
+        Args:
+            df: DataFrame containing documents with columns: id, title, url, full_text
+            technique: Chunking strategy to use
+            chunk_size: Maximum size of each chunk (characters for most, tokens for token splitter)
+            chunk_overlap: Overlap between consecutive chunks
+            max_docs: Maximum number of documents to process (None for all)
+            verbose: Override the instance's verbose setting (if None, uses instance setting)
+            **kwargs: Additional arguments to pass to splitter
+        Returns:
+            List of processed chunks with embeddings and metadata
         """
+        # Determine if we should print
+        should_print = verbose if verbose is not None else self.verbose
         splitter = self.get_splitter(technique, chunk_size, chunk_overlap, **kwargs)
         processed_chunks = []
+        # Select documents to process
+        if max_docs:
+            subset_df = df.head(max_docs)
+        else:
+            subset_df = df
+        # Validate required columns exist
+        required_cols = ['id', 'title', 'url', 'full_text']
+        missing_cols = [col for col in required_cols if col not in subset_df.columns]
+        if missing_cols:
+            raise ValueError(f"DataFrame missing required columns: {missing_cols}")
         for _, row in subset_df.iterrows():
+            if should_print:
+                self._print("\n" + "="*80)
+                self._print(f"📄 DOCUMENT: {row['title']}")
+                self._print(f"🔗 URL: {row['url']}")
+                self._print(f"📏 Technique: {technique.upper()} | Chunk Size: {chunk_size} | Overlap: {chunk_overlap}")
+                self._print("-" * 80)
             # Split the text
             raw_chunks = splitter.split_text(row['full_text'])
+            if should_print:
+                self._print(f"🎯 Total Chunks Generated: {len(raw_chunks)}")
             for i, text in enumerate(raw_chunks):
+                # Standardize output (handle both string and Document objects)
                 content = text.page_content if hasattr(text, 'page_content') else text
+                if should_print:
+                    # Print chunk preview
+                    self._print(f"\n[Chunk {i}] ({len(content)} chars):")
+                    preview = content[:200] + "..." if len(content) > 200 else content
+                    self._print(f"   {preview}")
+                # Generate embedding
                 embedding = self.encoder.encode(content).tolist()
                 processed_chunks.append({
                         "text": content,
                         "url": row['url'],
                         "chunk_index": i,
+                        "technique": technique,
+                        "chunk_size": len(content),
+                        "total_chunks": len(raw_chunks)
                     }
                 })
+            if should_print:
+                self._print("="*80)
+        if should_print:
+            self._print(f"\n✅ Finished processing {len(subset_df)} documents into {len(processed_chunks)} chunks.")
+            if len(processed_chunks) > 0:
+                self._print(f"📊 Average chunk size: {sum(c['metadata']['chunk_size'] for c in processed_chunks) / len(processed_chunks):.0f} chars")
+        return processed_chunks
+    def compare_strategies(self, df: pd.DataFrame, text_column: str = 'full_text',
+                          chunk_size: int = 500, max_docs: int = 1,
+                          verbose: Optional[bool] = None) -> Dict[str, Any]:
+        """
+        Compare different chunking strategies on the same document.
+        Returns:
+            Dictionary with comparison metrics for each strategy
+        """
+        # Determine if we should print
+        should_print = verbose if verbose is not None else self.verbose
+        strategies = ['fixed', 'recursive', 'character', 'sentence', 'semantic', 'token']
+        results = {}
+        # Get first document
+        sample_text = df.iloc[0][text_column]
+        for technique in strategies:
+            try:
+                if should_print:
+                    self._print(f"\n🔍 Testing {technique.upper()} strategy...")
+                splitter = self.get_splitter(technique, chunk_size=chunk_size)
+                chunks = splitter.split_text(sample_text)
+                # Analyze chunks
+                chunk_lengths = [len(c.page_content if hasattr(c, 'page_content') else c) for c in chunks]
+                avg_chunk_size = sum(chunk_lengths) / len(chunk_lengths) if chunk_lengths else 0
+                # Count how many chunks end with sentence boundaries
+                sentence_enders = ['.', '!', '?', '"', "'"]
+                complete_sentences = sum(1 for c in chunks
+                                        if (c.page_content if hasattr(c, 'page_content') else c).strip()[-1] in sentence_enders)
+                results[technique] = {
+                    'num_chunks': len(chunks),
+                    'avg_chunk_size': avg_chunk_size,
+                    'min_chunk_size': min(chunk_lengths) if chunk_lengths else 0,
+                    'max_chunk_size': max(chunk_lengths) if chunk_lengths else 0,
+                    'complete_sentences_ratio': complete_sentences / len(chunks) if chunks else 0,
+                    'chunk_lengths': chunk_lengths
+                }
+                if should_print:
+                    self._print(f"   ✓ Generated {len(chunks)} chunks, avg size: {avg_chunk_size:.0f} chars")
+            except Exception as e:
+                results[technique] = {'error': str(e)}
+                if should_print:
+                    self._print(f"   ✗ Error: {str(e)}")
+        return results

retriever/retriever.py CHANGED Viewed

@@ -2,21 +2,29 @@ import numpy as np
 from rank_bm25 import BM25Okapi
 from sentence_transformers import CrossEncoder
 from sklearn.metrics.pairwise import cosine_similarity
 class HybridRetriever:
-    def __init__(self, final_chunks, embed_model, rerank_model_name='cross-encoder/ms-marco-MiniLM-L-6-v2'):
         """
         :param final_chunks: The list of chunk dictionaries with metadata.
         :param embed_model: The SentenceTransformer model used for query and chunk embedding.
         """
         self.final_chunks = final_chunks
         self.embed_model = embed_model
         self.rerank_model = CrossEncoder(rerank_model_name)
         # Initialize BM25 corpus
         self.tokenized_corpus = [chunk['metadata']['text'].lower().split() for chunk in final_chunks]
         self.bm25 = BM25Okapi(self.tokenized_corpus)
     def _rrf_score(self, semantic_results, bm25_results, k=60):
         """Reciprocal Rank Fusion (RRF) Implementation."""
         scores = {}
@@ -67,42 +75,121 @@ class HybridRetriever:
         return [chunk_texts[i] for i in selected_indices]
-    def search(self, query, index, top_k=10, final_k=3, mode="hybrid", rerank_strategy="cross-encoder"):
         """
         :param mode: "semantic", "bm25", or "hybrid"
         :param rerank_strategy: "cross-encoder", "rrf", "mmr", or "none"
         """
         semantic_chunks = []
         bm25_chunks = []
         query_vector = None
         # 1. Fetch Candidates
         if mode in ["semantic", "hybrid"]:
             query_vector = self.embed_model.encode(query)
             res = index.query(vector=query_vector.tolist(), top_k=top_k, include_metadata=True)
             semantic_chunks = [match['metadata']['text'] for match in res['matches']]
         if mode in ["bm25", "hybrid"]:
             tokenized_query = query.lower().split()
             bm25_scores = self.bm25.get_scores(tokenized_query)
             top_indices = np.argsort(bm25_scores)[::-1][:top_k]
             bm25_chunks = [self.final_chunks[i]['metadata']['text'] for i in top_indices]
         # 2. Re-Ranking / Fusion
         if mode == "hybrid" and rerank_strategy == "rrf":
-            return self._rrf_score(semantic_chunks, bm25_chunks)[:final_k]
         # Standard combination for other methods
         combined = list(dict.fromkeys(semantic_chunks + bm25_chunks)) # Deduplicate keep order
         if rerank_strategy == "cross-encoder" and combined:
             pairs = [[query, chunk] for chunk in combined]
             scores = self.rerank_model.predict(pairs)
             results = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
-            return [res[0] for res in results[:final_k]]
         elif rerank_strategy == "mmr" and combined:
-            if query_vector is None: query_vector = self.embed_model.encode(query)
-            return self._maximal_marginal_relevance(query_vector, combined, top_k=final_k)
-        return combined[:final_k]

 from rank_bm25 import BM25Okapi
 from sentence_transformers import CrossEncoder
 from sklearn.metrics.pairwise import cosine_similarity
+from typing import Optional
 class HybridRetriever:
+    def __init__(self, final_chunks, embed_model, rerank_model_name='cross-encoder/ms-marco-MiniLM-L-6-v2', verbose: bool = True):
         """
         :param final_chunks: The list of chunk dictionaries with metadata.
         :param embed_model: The SentenceTransformer model used for query and chunk embedding.
+        :param verbose: Whether to print retrieval details and final results.
         """
         self.final_chunks = final_chunks
         self.embed_model = embed_model
         self.rerank_model = CrossEncoder(rerank_model_name)
+        self.verbose = verbose
         # Initialize BM25 corpus
         self.tokenized_corpus = [chunk['metadata']['text'].lower().split() for chunk in final_chunks]
         self.bm25 = BM25Okapi(self.tokenized_corpus)
+    def _print(self, *args, **kwargs):
+        """Helper method to conditionally print"""
+        if self.verbose:
+            print(*args, **kwargs)
     def _rrf_score(self, semantic_results, bm25_results, k=60):
         """Reciprocal Rank Fusion (RRF) Implementation."""
         scores = {}
         return [chunk_texts[i] for i in selected_indices]
+    def search(self, query, index, top_k=10, final_k=3, mode="hybrid", rerank_strategy="cross-encoder",
+               verbose: Optional[bool] = None):
         """
         :param mode: "semantic", "bm25", or "hybrid"
         :param rerank_strategy: "cross-encoder", "rrf", "mmr", or "none"
+        :param verbose: Override the instance's verbose setting (if None, uses instance setting)
         """
+        # Determine if we should print
+        should_print = verbose if verbose is not None else self.verbose
+        if should_print:
+            self._print("\n" + "="*80)
+            self._print(f"🔍 SEARCH QUERY: {query}")
+            self._print(f"📊 Mode: {mode.upper()} | Rerank: {rerank_strategy.upper()}")
+            self._print(f"🎯 Top-K: {top_k} | Final-K: {final_k}")
+            self._print("-" * 80)
         semantic_chunks = []
         bm25_chunks = []
         query_vector = None
         # 1. Fetch Candidates
         if mode in ["semantic", "hybrid"]:
+            if should_print:
+                self._print(f"📚 Semantic Search: Retrieving top {top_k} candidates...")
             query_vector = self.embed_model.encode(query)
             res = index.query(vector=query_vector.tolist(), top_k=top_k, include_metadata=True)
             semantic_chunks = [match['metadata']['text'] for match in res['matches']]
+            if should_print:
+                self._print(f"   ✓ Retrieved {len(semantic_chunks)} semantic candidates")
+                for i, chunk in enumerate(semantic_chunks[:3]):  # Show first 3
+                    preview = chunk[:100] + "..." if len(chunk) > 100 else chunk
+                    self._print(f"      [{i}] {preview}")
         if mode in ["bm25", "hybrid"]:
+            if should_print:
+                self._print(f"📚 BM25 Search: Retrieving top {top_k} candidates...")
             tokenized_query = query.lower().split()
             bm25_scores = self.bm25.get_scores(tokenized_query)
             top_indices = np.argsort(bm25_scores)[::-1][:top_k]
             bm25_chunks = [self.final_chunks[i]['metadata']['text'] for i in top_indices]
+            if should_print:
+                self._print(f"   ✓ Retrieved {len(bm25_chunks)} BM25 candidates")
+                for i, chunk in enumerate(bm25_chunks[:3]):  # Show first 3
+                    preview = chunk[:100] + "..." if len(chunk) > 100 else chunk
+                    self._print(f"      [{i}] {preview}")
         # 2. Re-Ranking / Fusion
         if mode == "hybrid" and rerank_strategy == "rrf":
+            if should_print:
+                self._print(f"🔄 Applying Reciprocal Rank Fusion (RRF)...")
+            results = self._rrf_score(semantic_chunks, bm25_chunks)[:final_k]
+            if should_print:
+                self._print(f"✅ Final {final_k} Results:")
+                for i, chunk in enumerate(results):
+                    preview = chunk[:150] + "..." if len(chunk) > 150 else chunk
+                    self._print(f"   [{i+1}] {preview}")
+                self._print("="*80)
+            return results
         # Standard combination for other methods
         combined = list(dict.fromkeys(semantic_chunks + bm25_chunks)) # Deduplicate keep order
+        if should_print:
+            self._print(f"🔄 Combined unique candidates: {len(combined)}")
+            self._print(f"🔄 Applying {rerank_strategy.upper()} reranking...")
         if rerank_strategy == "cross-encoder" and combined:
             pairs = [[query, chunk] for chunk in combined]
             scores = self.rerank_model.predict(pairs)
             results = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
+            results = [res[0] for res in results[:final_k]]
+            if should_print:
+                self._print(f"\n✅ Final {final_k} Results (Cross-Encoder Reranked):")
+                for i, chunk in enumerate(results):
+                    preview = chunk[:150] + "..." if len(chunk) > 150 else chunk
+                    self._print(f"   [{i+1}] {preview}")
+                self._print("="*80)
+            return results
         elif rerank_strategy == "mmr" and combined:
+            if should_print:
+                self._print(f"   Using MMR with λ=0.5 to balance relevance and diversity")
+            if query_vector is None:
+                query_vector = self.embed_model.encode(query)
+            results = self._maximal_marginal_relevance(query_vector, combined, top_k=final_k)
+            if should_print:
+                self._print(f"\n✅ Final {final_k} Results (MMR Reranked):")
+                for i, chunk in enumerate(results):
+                    preview = chunk[:150] + "..." if len(chunk) > 150 else chunk
+                    self._print(f"   [{i+1}] {preview}")
+                self._print("="*80)
+            return results
+        else:  # "none" or fallback
+            results = combined[:final_k]
+            if should_print:
+                self._print(f"\n✅ Final {final_k} Results (No Reranking):")
+                for i, chunk in enumerate(results):
+                    preview = chunk[:150] + "..." if len(chunk) > 150 else chunk
+                    self._print(f"   [{i+1}] {preview}")
+                self._print("="*80)
+            return results