Ramail Khan committed on
Commit
7b44ae2
Β·
unverified Β·
2 Parent(s): 74dd3286c2979e

Merge pull request #4 from ramailkk/Changed_Ingestion_Logic

Browse files
Files changed (10) hide show
  1. BACKEND_README.md +61 -0
  2. EntireBookCleaned.txt +0 -0
  3. api.py +1 -1
  4. config.yaml +13 -13
  5. data_loader.py +86 -36
  6. ingest.py +221 -0
  7. main.py +115 -25
  8. requirements.txt +1 -0
  9. retriever/processor.py +23 -1
  10. test_backend.py +172 -0
BACKEND_README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CBT RAG System - Backend Documentation
2
+
3
+ ## Overview
4
+
5
+ This is a Retrieval-Augmented Generation (RAG) system for Cognitive Behavioral Therapy (CBT) content. It uses a CBT textbook as the knowledge base and implements hybrid search with re-ranking for accurate question answering.
6
+
7
+ ## Embedding Model: Jina v2 Small
8
+
9
+ **Model**: `jinaai/jina-embeddings-v2-small-en`
10
+
11
+ | Property | Value |
12
+ | ---------- | ------ |
13
+ | Dimensions | 512 |
14
+ | Max Tokens | 8,192 |
15
+ | Parameters | 33M |
16
+ | Disk Size | ~130MB |
17
+
18
+ ## Ingestion: 6 Chunking Techniques (Single Index)
19
+
20
+ The `ingest.py` script processes the CBT book **6 times** with different chunking strategies for the ablation study. All chunks are stored in a **SINGLE Pinecone index** with metadata to differentiate.
21
+
22
+ | # | Technique | Description |
23
+ | --- | --------- | ------------------------------------------------------------------------------------ |
24
+ | 1 | fixed | Fixed-size chunking - splits every N characters (may cut sentences mid-way) |
25
+ | 2 | sentence | Sentence-level chunking - respects sentence boundaries (NLTK) |
26
+ | 3 | paragraph | Paragraph-level chunking - uses natural paragraph breaks |
27
+ | 4 | semantic | Semantic chunking - splits where topic/meaning shifts (embedding similarity) |
28
 + | 5 | recursive | Recursive chunking - hierarchical splitting (paragraphs → sentences → words → chars) |
29
+ | 6 | page | Page-level chunking - uses entire book pages as-is |
30
+
31
+ **Single Pinecone Index:** `cbt-book-recursive`
32
+
33
+ **Metadata:** Each chunk includes `chunking_technique` field for filtering.
34
+
35
+ ## Configuration (`config.yaml`)
36
+
37
+ ```yaml
38
+ processing:
39
+ embedding_model: "jinaai/jina-embeddings-v2-small-en"
40
+ technique: "recursive"
41
+ chunk_size: 1000
42
+ chunk_overlap: 100
43
+
44
+ vector_db:
45
+ base_index_name: "cbt-book"
46
+ dimension: 512
47
+ metric: "cosine"
48
+ ```
49
+
50
+ ## Running the System
51
+
52
+ ```bash
53
+ source venv/bin/activate
54
+ pip install -r requirements.txt
55
+ python ingest.py
56
+ python -m uvicorn api:app --host 0.0.0.0 --port 8000
57
+ ```
58
+
59
+ ## Pinecone Setup
60
+
61
+ Create index `cbt-book-recursive` with 512 dimensions, cosine metric.
EntireBookCleaned.txt ADDED
The diff for this file is too large to render. See raw diff
 
api.py CHANGED
@@ -208,7 +208,7 @@ def startup_event() -> None:
208
  if not hf_token:
209
  raise RuntimeError("HF_TOKEN not found in environment variables")
210
 
211
- index_name = "arxiv-tournament-recursive"
212
  embed_model_name = "all-MiniLM-L6-v2"
213
  project_root = os.path.dirname(os.path.abspath(__file__))
214
  cache_dir = os.getenv("BM25_CACHE_DIR", os.path.join(project_root, ".cache"))
 
208
  if not hf_token:
209
  raise RuntimeError("HF_TOKEN not found in environment variables")
210
 
211
+ index_name = "cbt-book-recursive"
212
  embed_model_name = "all-MiniLM-L6-v2"
213
  project_root = os.path.dirname(os.path.abspath(__file__))
214
  cache_dir = os.getenv("BM25_CACHE_DIR", os.path.join(project_root, ".cache"))
config.yaml CHANGED
@@ -1,26 +1,26 @@
1
  # ------------------------------------------------------------------
2
- # RAG TOURNAMENT CONFIGURATION
3
  # ------------------------------------------------------------------
4
 
5
  project:
6
- name: "arxiv-research-rag"
7
- category: "cs.AI"
8
- doc_limit: 5
9
 
10
  processing:
11
  # Embedding model used for both vector db and evaluator similarity
12
- embedding_model: "all-MiniLM-L6-v2"
13
  # Options: sentence, recursive, semantic, fixed
14
- technique: "recursive"
15
- # Token limit for MiniLM is 256; keeping it at 250 for safety
16
- chunk_size: 500
17
- chunk_overlap: 50
18
 
19
  vector_db:
20
- base_index_name: "arxiv-tournament"
21
- dimension: 384
22
  metric: "cosine"
23
- batch_size: 100
24
 
25
  retrieval:
26
  # Options: hybrid, semantic, bm25
@@ -43,4 +43,4 @@ models:
43
  - "Mistral-7B"
44
  - "Qwen-2.5"
45
  - "DeepSeek-V3"
46
- - "TinyAya"
 
1
  # ------------------------------------------------------------------
2
+ # RAG CBT QUESTION-ANSWERING SYSTEM CONFIGURATION
3
  # ------------------------------------------------------------------
4
 
5
  project:
6
+ name: "cbt-rag-system"
7
+ category: "psychology"
8
+ doc_limit: null # Load all pages from the book
9
 
10
  processing:
11
  # Embedding model used for both vector db and evaluator similarity
12
+ embedding_model: "jinaai/jina-embeddings-v2-small-en"
13
  # Options: sentence, recursive, semantic, fixed
14
+ technique: "recursive"
15
+ # Jina supports 8192 tokens (~32k chars), using 1000 chars for better context
16
+ chunk_size: 1000
17
+ chunk_overlap: 100
18
 
19
  vector_db:
20
+ base_index_name: "cbt-book"
21
+ dimension: 512 # Jina outputs 512 dimensions
22
  metric: "cosine"
23
+ batch_size: 50 # Reduced batch size for CPU processing
24
 
25
  retrieval:
26
  # Options: hybrid, semantic, bm25
 
43
  - "Mistral-7B"
44
  - "Qwen-2.5"
45
  - "DeepSeek-V3"
46
+ - "TinyAya"
data_loader.py CHANGED
@@ -1,41 +1,91 @@
1
- import fitz # PyMuPDF
2
- import requests
3
- import io
4
- import arxiv
5
  import pandas as pd
 
6
 
7
- def extract_text_from_url(pdf_url):
8
- """Downloads a PDF and extracts all text."""
 
 
 
 
 
 
 
 
 
 
9
  try:
10
- response = requests.get(pdf_url)
11
- # Open the PDF directly from the byte stream
12
- with fitz.open(stream=io.BytesIO(response.content), filetype="pdf") as doc:
13
- text = ""
14
- for page in doc:
15
- text += page.get_text()
16
- return text.replace('\n', ' ')
17
- except Exception as e:
18
- print(f"Error downloading {pdf_url}: {e}")
19
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- def fetch_arxiv_data(category="cs.AI", limit=5):
22
- client = arxiv.Client()
23
- search = arxiv.Search(
24
- query=f"cat:{category}",
25
- max_results=limit,
26
- sort_by=arxiv.SortCriterion.SubmittedDate
27
- )
28
-
29
- results = []
30
- for r in client.results(search):
31
- print(f"Downloading full text for: {r.title[:50]}...")
32
- full_text = extract_text_from_url(r.pdf_url)
33
 
34
- results.append({
35
- "id": r.entry_id.split('/')[-1],
36
- "title": r.title,
37
- "abstract": r.summary.replace('\n', ' '),
38
- "full_text": full_text, # <--- Main part of the data
39
- "url": r.pdf_url
40
- })
41
- return pd.DataFrame(results)
 
 
 
 
 
 
1
+ import re
 
 
 
2
  import pandas as pd
3
+ from typing import List, Dict, Any
4
 
5
+
6
def load_cbt_book(file_path: str = "EntireBookCleaned.txt") -> pd.DataFrame:
    """
    Load the CBT book from a text file and parse it into per-page documents.

    Args:
        file_path: Path to the cleaned book text file.

    Returns:
        DataFrame with columns: id, title, url, full_text.

    Raises:
        FileNotFoundError: If the book file does not exist.
        ValueError: If no pages could be parsed from the file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        raise FileNotFoundError(f"Book file not found: {file_path}")

    # Page markers look like "--- Page X ---" or "--- Page X of Y ---".
    # Because the pattern has one capturing group, re.split returns:
    #   [preamble, page_num_1, body_1, page_num_2, body_2, ...]
    page_pattern = r'---\s*Page\s+(\d+)(?:\s+of\s+\d+)?\s*---'
    pieces = re.split(page_pattern, content)

    documents = []
    # pieces[0] is whatever precedes the first marker; pair up the rest.
    for raw_num, raw_body in zip(pieces[1::2], pieces[2::2]):
        page_num = raw_num.strip()
        # Collapse runs of 3+ newlines down to a single paragraph break.
        body = re.sub(r'\n{3,}', '\n\n', raw_body.strip()).strip()
        if not body:
            continue  # skip empty pages

        # Use the first line as the title when it looks like a heading;
        # otherwise fall back to a generic per-page title.
        first_line = body.split('\n')[0].strip()
        if 10 < len(first_line) < 200:
            title = first_line
        else:
            title = f"CBT Book - Page {page_num}"

        documents.append({
            "id": f"cbt-page-{page_num}",
            "title": title,
            "url": f"https://res.cloudinary.com/dajb4c1g5/image/upload/v1774864993/topic_pdfs/93/merged_pdf_1774864989649.pdf.pdf#page={page_num}",
            "full_text": body
        })

    if not documents:
        raise ValueError("No documents were parsed from the book file")

    df = pd.DataFrame(documents)
    print(f"Loaded {len(df)} pages from CBT book")
    return df
70
+
71
 
72
def get_book_stats(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Compute summary statistics for the loaded book.

    Args:
        df: DataFrame of book pages with a 'full_text' column.

    Returns:
        Dict with total_pages, total_characters, average_chars_per_page,
        min_chars and max_chars.
    """
    # Compute the per-page character lengths once and reuse the Series.
    lengths = df['full_text'].str.len()
    return {
        "total_pages": len(df),
        "total_characters": lengths.sum(),
        "average_chars_per_page": round(lengths.mean(), 2),
        "min_chars": lengths.min(),
        "max_chars": lengths.max(),
    }
ingest.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Script to ingest CBT book data into Pinecone vector database.
3
+ Ingests the book 6 times with different chunking formats for ablation study.
4
+ All chunks are stored in a SINGLE index with metadata to differentiate.
5
+ Run this once before starting the API server.
6
+ """
7
+ import os
8
+ import time
9
+ from dotenv import load_dotenv
10
+ from config_loader import cfg
11
+ from data_loader import load_cbt_book, get_book_stats
12
+ from vector_db import get_pinecone_index, refresh_pinecone_index
13
+ from retriever.processor import ChunkProcessor
14
+
15
+
16
# The six chunking strategies compared in the ablation study. Every entry
# supplies the splitter name understood by ChunkProcessor.process, the chunk
# sizing parameters, and splitter-specific keyword arguments.
CHUNKING_TECHNIQUES = [
    {
        "name": "fixed",
        "description": "Fixed-size chunking - splits every N characters (may cut sentences mid-way)",
        "chunk_size": 1000,
        "chunk_overlap": 100,
        "kwargs": {"separator": ""},  # empty separator: cut at exact character offsets
    },
    {
        "name": "sentence",
        "description": "Sentence-level chunking - respects sentence boundaries (NLTK)",
        "chunk_size": 2400,
        "chunk_overlap": 100,
        "kwargs": {},
    },
    {
        "name": "paragraph",
        "description": "Paragraph-level chunking - uses natural paragraph breaks",
        "chunk_size": 4000,
        "chunk_overlap": 100,
        "kwargs": {"separator": "\n\n"},  # split on blank-line paragraph breaks
    },
    {
        "name": "semantic",
        "description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
        "chunk_size": 2000,
        "chunk_overlap": 100,
        "kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
    },
    {
        "name": "recursive",
        # Arrow characters restored here: the source had mojibake for the "→" glyphs.
        "description": "Recursive chunking - hierarchical splitting (paragraphs → sentences → words → chars)",
        "chunk_size": 2000,
        "chunk_overlap": 100,
        "kwargs": {"separators": ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""], "keep_separator": True},
    },
    {
        "name": "page",
        "description": "Page-level chunking - uses entire book pages as-is",
        "chunk_size": 10000,  # large enough that a full page is never re-split
        "chunk_overlap": 0,   # pages do not overlap
        "kwargs": {"separator": "--- Page"},  # split on the book's page markers
    },
]
61
+
62
+
63
def ingest_single_technique(
    raw_data,
    proc,
    technique_config,
    technique_index,
    total_techniques,
):
    """Chunk the book with one technique and tag every chunk with its name.

    Args:
        raw_data: DataFrame of book pages to chunk.
        proc: ChunkProcessor used for chunking and embedding.
        technique_config: One entry from CHUNKING_TECHNIQUES.
        technique_index: 1-based position of this technique (for logging).
        total_techniques: Total number of techniques (for logging).

    Returns:
        List of chunk dicts whose ids are prefixed with the technique name
        and whose metadata carries a 'chunking_technique' field.
    """
    name = technique_config["name"]
    size = technique_config["chunk_size"]
    overlap = technique_config["chunk_overlap"]
    splitter_kwargs = technique_config.get("kwargs", {})

    print(f"\n[{technique_index}/{total_techniques}] Processing '{name}'...")
    print(f" Description: {technique_config['description']}")
    print(f" Chunk size: {size}, Overlap: {overlap}")

    # Chunk and embed the whole book with this technique's settings.
    final_chunks = proc.process(
        raw_data,
        technique=name,
        chunk_size=size,
        chunk_overlap=overlap,
        max_docs=cfg.project.get("doc_limit"),
        verbose=False,
        **splitter_kwargs,
    )

    # Prefix ids with the technique so they stay unique across techniques,
    # and record the technique in metadata for later filtering.
    for chunk in final_chunks:
        chunk["metadata"]["chunking_technique"] = name
        chunk["id"] = f"{name}-{chunk['id']}"

    print(f" Created {len(final_chunks)} chunks")

    return final_chunks
100
+
101
+
102
def ingest_data():
    """Load the CBT book, chunk it with all 6 techniques, and upload every
    chunk to one shared Pinecone index.

    Chunks from different techniques share the index and are distinguished
    by the 'chunking_technique' metadata field and a technique-prefixed id.

    Raises:
        RuntimeError: If PINECONE_API_KEY is missing from the environment.
    """
    load_dotenv()

    pinecone_key = os.getenv("PINECONE_API_KEY")
    if not pinecone_key:
        raise RuntimeError("PINECONE_API_KEY not found in environment variables")

    banner = "=" * 80
    print(banner)
    print("CBT BOOK INGESTION PIPELINE - 6 TECHNIQUES (SINGLE INDEX)")
    print(banner)
    print(f"\nTechniques to process: {len(CHUNKING_TECHNIQUES)}")
    for position, spec in enumerate(CHUNKING_TECHNIQUES, 1):
        print(f" {position}. {spec['name']}: {spec['description']}")
    print(f"\nAll chunks will be stored in a SINGLE index: {cfg.db['base_index_name']}-{cfg.processing['technique']}")
    print("Chunks are differentiated by 'chunking_technique' metadata field.")

    # Step 1: load the book once; the same pages are re-chunked per technique.
    print(f"\n{banner}")
    print("STEP 1: LOADING CBT BOOK")
    print(banner)
    print("\nLoading CBT book from EntireBookCleaned.txt...")
    raw_data = load_cbt_book("EntireBookCleaned.txt")
    stats = get_book_stats(raw_data)
    print(f" Loaded {stats['total_pages']} pages")
    print(f" Total characters: {stats['total_characters']:,}")
    print(f" Average chars per page: {stats['average_chars_per_page']:.0f}")

    # One shared processor/embedder reused for every technique.
    print(f"\nInitializing embedding model: {cfg.processing['embedding_model']}")
    proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)

    # Step 2: run each technique in turn, collecting chunks and outcomes.
    print(f"\n{banner}")
    print("STEP 2: CHUNKING WITH 6 TECHNIQUES")
    print(banner)

    all_chunks = []
    results = {}

    for position, spec in enumerate(CHUNKING_TECHNIQUES, 1):
        try:
            chunks = ingest_single_technique(
                raw_data=raw_data,
                proc=proc,
                technique_config=spec,
                technique_index=position,
                total_techniques=len(CHUNKING_TECHNIQUES),
            )
            all_chunks.extend(chunks)
            results[spec["name"]] = {"status": "success", "chunks": len(chunks)}

            # Pause between techniques so the embedding API is not hammered.
            if position < len(CHUNKING_TECHNIQUES):
                print(" Waiting 5 seconds before next technique (rate limit protection)...")
                time.sleep(5)

        except Exception as e:
            # A failing technique is recorded but does not abort the others.
            print(f" ERROR with technique '{spec['name']}': {e}")
            results[spec["name"]] = {"status": "failed", "error": str(e)}

    # Step 3: push everything into the single shared index.
    print(f"\n{banner}")
    print("STEP 3: UPLOADING TO SINGLE PINECONE INDEX")
    print(banner)

    index_name = f"{cfg.db['base_index_name']}-{cfg.processing['technique']}"
    print(f"\nIndex name: {index_name}")
    print(f"Dimension: {cfg.db['dimension']}")
    print(f"Metric: {cfg.db['metric']}")
    print(f"Total chunks to upload: {len(all_chunks)}")

    index = get_pinecone_index(
        pinecone_key,
        cfg.db['base_index_name'],
        technique=cfg.processing['technique'],
        dimension=cfg.db['dimension'],
        metric=cfg.db['metric'],
    )

    print(f"\nUploading {len(all_chunks)} vectors to Pinecone...")
    refresh_pinecone_index(index, all_chunks, batch_size=cfg.db['batch_size'])

    _print_summary(results, all_chunks, index_name, banner)


def _print_summary(results, all_chunks, index_name, banner):
    """Print the per-technique ingestion summary table and next steps."""
    print(f"\n{banner}")
    print("INGESTION COMPLETE - SUMMARY")
    print(banner)
    print(f"\n{'Technique':<15} {'Status':<12} {'Chunks':<10}")
    print("-" * 40)
    total_chunks = 0
    for spec in CHUNKING_TECHNIQUES:
        outcome = results.get(spec["name"], {})
        status = outcome.get("status", "unknown")
        count = outcome.get("chunks", 0)
        if status == "success":
            total_chunks += count
        print(f"{spec['name']:<15} {status:<12} {count:<10}")
    print("-" * 40)
    print(f"{'TOTAL':<15} {'':<12} {total_chunks:<10}")

    print(f"\nSingle index: {index_name}")
    print(f"Total vectors: {len(all_chunks)}")
    print("\nChunks can be filtered by 'chunking_technique' metadata field:")
    for spec in CHUNKING_TECHNIQUES:
        if results.get(spec["name"], {}).get("status") == "success":
            print(f" - chunking_technique: '{spec['name']}'")

    print("\nYou can now start the API server with:")
    print(" python -m uvicorn api:app --host 0.0.0.0 --port 8000")
218
+
219
+
220
+ if __name__ == "__main__":
221
+ ingest_data()
main.py CHANGED
@@ -1,15 +1,15 @@
1
  import os
2
  from dotenv import load_dotenv
3
- from config_loader import cfg # Import the Mother Config
4
 
5
  from vector_db import get_pinecone_index, refresh_pinecone_index
6
  from retriever.retriever import HybridRetriever
7
  from retriever.generator import RAGGenerator
8
  from retriever.processor import ChunkProcessor
9
  from retriever.evaluator import RAGEvaluator
10
- import data_loader as dl
11
 
12
- # Import fleet mapping
13
  from models.llama_3_8b import Llama3_8B
14
  from models.mistral_7b import Mistral_7b
15
  from models.qwen_2_5 import Qwen2_5
@@ -26,16 +26,36 @@ MODEL_MAP = {
26
 
27
  load_dotenv()
28
 
 
29
  def main():
 
30
  hf_token = os.getenv("HF_TOKEN")
31
  pinecone_key = os.getenv("PINECONE_API_KEY")
32
- query = "How do transformers handle long sequences?"
33
 
34
- # 1. Data Ingestion (Controlled by Config)
35
- raw_data = dl.fetch_arxiv_data(
36
- category=cfg.project['category'],
37
- limit=cfg.project['doc_limit']
38
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  # 2. Chunking & Embedding
41
  proc = ChunkProcessor(model_name=cfg.processing['embedding_model'])
@@ -43,20 +63,38 @@ def main():
43
  raw_data,
44
  technique=cfg.processing['technique'],
45
  chunk_size=cfg.processing['chunk_size'],
46
- chunk_overlap=cfg.processing['chunk_overlap']
 
 
47
  )
48
 
49
- # 3. Vector DB (Auto-names index based on technique)
 
 
 
 
 
 
50
  index = get_pinecone_index(
51
- pinecone_key,
52
- cfg.db['base_index_name'],
53
  technique=cfg.processing['technique'],
54
  dimension=cfg.db['dimension']
55
  )
56
  refresh_pinecone_index(index, final_chunks, batch_size=cfg.db['batch_size'])
57
 
58
- # 4. Retrieval
 
 
 
 
59
  retriever = HybridRetriever(final_chunks, proc.encoder)
 
 
 
 
 
 
60
  context_chunks = retriever.search(
61
  query, index,
62
  mode=cfg.retrieval['mode'],
@@ -66,44 +104,96 @@ def main():
66
  final_k=cfg.retrieval['final_k']
67
  )
68
 
69
- # 5. Initialization of Contestants
 
 
 
 
 
 
70
  rag_engine = RAGGenerator()
71
  models = {name: MODEL_MAP[name](token=hf_token) for name in cfg.model_list}
72
-
73
- # Setup Evaluator with the designated Judge
74
 
 
75
  evaluator = RAGEvaluator(
76
- judge_model=cfg.gen['judge_model'],
77
- embedding_model=proc.encoder,
78
- api_key=os.getenv("GROQ_API_KEY")
79
  )
80
 
81
  tournament_results = {}
82
 
83
- # 6. Tournament Loop
84
  for name, model_inst in models.items():
85
- print(f"\n--- Processing {name} ---")
 
 
86
  try:
87
  # Generation
88
  answer = rag_engine.get_answer(
89
- model_inst, query, context_chunks,
90
  temperature=cfg.gen['temperature']
91
  )
92
-
 
 
 
93
  # Faithfulness Evaluation
94
  faith = evaluator.evaluate_faithfulness(answer, context_chunks)
95
  # Relevancy Evaluation
96
  rel = evaluator.evaluate_relevancy(query, answer)
97
 
98
  tournament_results[name] = {
 
99
  "Faithfulness": faith['score'],
100
  "Relevancy": rel['score'],
101
  "Claims": faith['details']
102
  }
 
 
 
 
 
103
  except Exception as e:
104
  print(f"Error evaluating {name}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
- # 7. Final Output (Omitted for brevity, use your existing report logic)
107
 
108
  if __name__ == "__main__":
109
  main()
 
1
  import os
2
  from dotenv import load_dotenv
3
+ from config_loader import cfg
4
 
5
  from vector_db import get_pinecone_index, refresh_pinecone_index
6
  from retriever.retriever import HybridRetriever
7
  from retriever.generator import RAGGenerator
8
  from retriever.processor import ChunkProcessor
9
  from retriever.evaluator import RAGEvaluator
10
+ from data_loader import load_cbt_book, get_book_stats
11
 
12
+ # Import model fleet
13
  from models.llama_3_8b import Llama3_8B
14
  from models.mistral_7b import Mistral_7b
15
  from models.qwen_2_5 import Qwen2_5
 
26
 
27
  load_dotenv()
28
 
29
+
30
  def main():
31
+ """Main function to run the RAG tournament on CBT book."""
32
  hf_token = os.getenv("HF_TOKEN")
33
  pinecone_key = os.getenv("PINECONE_API_KEY")
34
+ groq_key = os.getenv("GROQ_API_KEY")
35
 
36
+ # Verify environment variables
37
+ if not hf_token:
38
+ raise RuntimeError("HF_TOKEN not found in environment variables")
39
+ if not pinecone_key:
40
+ raise RuntimeError("PINECONE_API_KEY not found in environment variables")
41
+ if not groq_key:
42
+ raise RuntimeError("GROQ_API_KEY not found in environment variables")
43
+
44
+ # Example query for testing
45
+ query = "What is cognitive behavior therapy and how does it work?"
46
+
47
+ print("=" * 80)
48
+ print("CBT RAG SYSTEM - LOADING DATA")
49
+ print("=" * 80)
50
+
51
+ # 1. Data Ingestion - Load CBT Book
52
+ raw_data = load_cbt_book("EntireBookCleaned.txt")
53
+ stats = get_book_stats(raw_data)
54
+ print(f"Book Statistics: {stats}")
55
+
56
+ print("\n" + "=" * 80)
57
+ print("CHUNKING AND EMBEDDING")
58
+ print("=" * 80)
59
 
60
  # 2. Chunking & Embedding
61
  proc = ChunkProcessor(model_name=cfg.processing['embedding_model'])
 
63
  raw_data,
64
  technique=cfg.processing['technique'],
65
  chunk_size=cfg.processing['chunk_size'],
66
+ chunk_overlap=cfg.processing['chunk_overlap'],
67
+ max_docs=cfg.project.get('doc_limit'), # None means load all
68
+ verbose=True
69
  )
70
 
71
+ print(f"\nTotal chunks created: {len(final_chunks)}")
72
+
73
+ print("\n" + "=" * 80)
74
+ print("VECTOR DATABASE SETUP")
75
+ print("=" * 80)
76
+
77
+ # 3. Vector DB - Create/Update Pinecone Index
78
  index = get_pinecone_index(
79
+ pinecone_key,
80
+ cfg.db['base_index_name'],
81
  technique=cfg.processing['technique'],
82
  dimension=cfg.db['dimension']
83
  )
84
  refresh_pinecone_index(index, final_chunks, batch_size=cfg.db['batch_size'])
85
 
86
+ print("\n" + "=" * 80)
87
+ print("RETRIEVAL SETUP")
88
+ print("=" * 80)
89
+
90
+ # 4. Retrieval Setup
91
  retriever = HybridRetriever(final_chunks, proc.encoder)
92
+
93
+ print("\n" + "=" * 80)
94
+ print(f"TESTING QUERY: {query}")
95
+ print("=" * 80)
96
+
97
+ # Test retrieval
98
  context_chunks = retriever.search(
99
  query, index,
100
  mode=cfg.retrieval['mode'],
 
104
  final_k=cfg.retrieval['final_k']
105
  )
106
 
107
+ print(f"\nRetrieved {len(context_chunks)} context chunks")
108
+
109
+ print("\n" + "=" * 80)
110
+ print("MODEL TOURNAMENT")
111
+ print("=" * 80)
112
+
113
+ # 5. Initialize Models
114
  rag_engine = RAGGenerator()
115
  models = {name: MODEL_MAP[name](token=hf_token) for name in cfg.model_list}
 
 
116
 
117
+ # 6. Setup Evaluator with Judge
118
  evaluator = RAGEvaluator(
119
+ judge_model=cfg.gen['judge_model'],
120
+ embedding_model=proc.encoder,
121
+ api_key=groq_key
122
  )
123
 
124
  tournament_results = {}
125
 
126
+ # 7. Tournament Loop
127
  for name, model_inst in models.items():
128
+ print(f"\n{'='*60}")
129
+ print(f"Processing {name}")
130
+ print('='*60)
131
  try:
132
  # Generation
133
  answer = rag_engine.get_answer(
134
+ model_inst, query, context_chunks,
135
  temperature=cfg.gen['temperature']
136
  )
137
+
138
+ print(f"\nAnswer from {name}:")
139
+ print(answer[:500] + "..." if len(answer) > 500 else answer)
140
+
141
  # Faithfulness Evaluation
142
  faith = evaluator.evaluate_faithfulness(answer, context_chunks)
143
  # Relevancy Evaluation
144
  rel = evaluator.evaluate_relevancy(query, answer)
145
 
146
  tournament_results[name] = {
147
+ "answer": answer,
148
  "Faithfulness": faith['score'],
149
  "Relevancy": rel['score'],
150
  "Claims": faith['details']
151
  }
152
+
153
+ print(f"\n{name} Results:")
154
+ print(f" Faithfulness: {faith['score']:.1f}%")
155
+ print(f" Relevancy: {rel['score']:.3f}")
156
+
157
  except Exception as e:
158
  print(f"Error evaluating {name}: {e}")
159
+ tournament_results[name] = {
160
+ "answer": "",
161
+ "Faithfulness": 0,
162
+ "Relevancy": 0,
163
+ "Claims": [],
164
+ "error": str(e)
165
+ }
166
+
167
+ # 8. Final Results Summary
168
+ print("\n" + "=" * 80)
169
+ print("TOURNAMENT RESULTS SUMMARY")
170
+ print("=" * 80)
171
+
172
+ print(f"\nQuery: {query}")
173
+ print(f"\nRetrieved Context Chunks: {len(context_chunks)}")
174
+ print("\n" + "-" * 60)
175
+ print(f"{'Model':<20} {'Faithfulness':>15} {'Relevancy':>15}")
176
+ print("-" * 60)
177
+
178
+ for name, results in tournament_results.items():
179
+ faith = results.get('Faithfulness', 0)
180
+ rel = results.get('Relevancy', 0)
181
+ print(f"{name:<20} {faith:>14.1f}% {rel:>15.3f}")
182
+
183
+ print("-" * 60)
184
+
185
+ # Find best model
186
+ if tournament_results:
187
+ best_model = max(
188
+ tournament_results.items(),
189
+ key=lambda x: x[1].get('Faithfulness', 0) + x[1].get('Relevancy', 0)
190
+ )
191
+ print(f"\nBest Overall Model: {best_model[0]}")
192
+ print(f" Faithfulness: {best_model[1]['Faithfulness']:.1f}%")
193
+ print(f" Relevancy: {best_model[1]['Relevancy']:.3f}")
194
+
195
+ return tournament_results
196
 
 
197
 
198
  if __name__ == "__main__":
199
  main()
requirements.txt CHANGED
@@ -93,3 +93,4 @@ uuid_utils==0.14.1
93
  xxhash==3.6.0
94
  yarl==1.23.0
95
  zstandard==0.25.0
 
 
93
  xxhash==3.6.0
94
  yarl==1.23.0
95
  zstandard==0.25.0
96
+ groq==0.13.0
retriever/processor.py CHANGED
@@ -37,8 +37,10 @@ class ChunkProcessor:
37
  - "fixed": Character-based, may split mid-sentence
38
  - "recursive": Recursive character splitting with hierarchical separators
39
  - "character": Character-based splitting on paragraph boundaries
 
40
  - "sentence": Sliding window over NLTK sentences
41
  - "semantic": Embedding-based semantic chunking
 
42
  """
43
  if technique == "fixed":
44
  return CharacterTextSplitter(
@@ -67,6 +69,16 @@ class ChunkProcessor:
67
  is_separator_regex=False
68
  )
69
 
 
 
 
 
 
 
 
 
 
 
70
  elif technique == "sentence":
71
  # sentence-level chunking using NLTK
72
  return NLTKTextSplitter(
@@ -83,8 +95,18 @@ class ChunkProcessor:
83
  breakpoint_threshold_amount=kwargs.get('breakpoint_threshold_amount', 70)
84
  )
85
 
 
 
 
 
 
 
 
 
 
 
86
  else:
87
- raise ValueError(f"Technique '{technique}' is not supported. Choose from: fixed, recursive, character, sentence, semantic")
88
 
89
  # ------------------------------------------------------------------
90
  # Processing
 
37
  - "fixed": Character-based, may split mid-sentence
38
  - "recursive": Recursive character splitting with hierarchical separators
39
  - "character": Character-based splitting on paragraph boundaries
40
+ - "paragraph": Paragraph-level splitting on \\n\\n boundaries
41
  - "sentence": Sliding window over NLTK sentences
42
  - "semantic": Embedding-based semantic chunking
43
+ - "page": Page-level splitting on page markers
44
  """
45
  if technique == "fixed":
46
  return CharacterTextSplitter(
 
69
  is_separator_regex=False
70
  )
71
 
72
+ elif technique == "paragraph":
73
+ # Paragraph-level chunking using paragraph breaks
74
+ return CharacterTextSplitter(
75
+ separator=kwargs.get('separator', "\n\n"),
76
+ chunk_size=chunk_size,
77
+ chunk_overlap=chunk_overlap,
78
+ length_function=len,
79
+ is_separator_regex=False
80
+ )
81
+
82
  elif technique == "sentence":
83
  # sentence-level chunking using NLTK
84
  return NLTKTextSplitter(
 
95
  breakpoint_threshold_amount=kwargs.get('breakpoint_threshold_amount', 70)
96
  )
97
 
98
+ elif technique == "page":
99
+ # Page-level chunking using page markers
100
+ return CharacterTextSplitter(
101
+ separator=kwargs.get('separator', "--- Page"),
102
+ chunk_size=chunk_size,
103
+ chunk_overlap=chunk_overlap,
104
+ length_function=len,
105
+ is_separator_regex=False
106
+ )
107
+
108
  else:
109
+ raise ValueError(f"Technique '{technique}' is not supported. Choose from: fixed, recursive, character, paragraph, sentence, semantic, page")
110
 
111
  # ------------------------------------------------------------------
112
  # Processing
test_backend.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simple test script to verify backend components work correctly.
3
+ """
4
+ import os
5
+ import sys
6
+ from dotenv import load_dotenv
7
+
8
+ # Load environment variables
9
+ load_dotenv()
10
+
11
+ def test_data_loader():
12
+ """Test the data loader module."""
13
+ print("=" * 60)
14
+ print("Testing Data Loader...")
15
+ print("=" * 60)
16
+
17
+ from data_loader import load_cbt_book, get_book_stats
18
+
19
+ df = load_cbt_book("EntireBookCleaned.txt")
20
+ stats = get_book_stats(df)
21
+
22
+ print(f"βœ“ Loaded {stats['total_pages']} pages")
23
+ print(f"βœ“ Total characters: {stats['total_characters']:,}")
24
+ print(f"βœ“ Average chars per page: {stats['average_chars_per_page']:.0f}")
25
+
26
+ assert stats['total_pages'] > 0, "No pages loaded"
27
+ print("βœ“ Data loader test PASSED\n")
28
+ return df
29
+
30
+ def test_config_loader():
31
+ """Test the config loader module."""
32
+ print("=" * 60)
33
+ print("Testing Config Loader...")
34
+ print("=" * 60)
35
+
36
+ from config_loader import cfg
37
+
38
+ print(f"βœ“ Project name: {cfg.project['name']}")
39
+ print(f"βœ“ Embedding model: {cfg.processing['embedding_model']}")
40
+ print(f"βœ“ Chunking technique: {cfg.processing['technique']}")
41
+ print(f"βœ“ Chunk size: {cfg.processing['chunk_size']}")
42
+ print(f"βœ“ Vector DB index: {cfg.db['base_index_name']}")
43
+ print(f"βœ“ Retrieval mode: {cfg.retrieval['mode']}")
44
+ print(f"βœ“ Rerank strategy: {cfg.retrieval['rerank_strategy']}")
45
+ print(f"βœ“ Models to evaluate: {cfg.model_list}")
46
+
47
+ assert cfg.project['name'] == "cbt-rag-system", "Project name mismatch"
48
+ assert cfg.db['base_index_name'] == "cbt-book", "Index name mismatch"
49
+ print("βœ“ Config loader test PASSED\n")
50
+ return cfg
51
+
52
+ def test_chunk_processor(df):
53
+ """Test the chunk processor module."""
54
+ print("=" * 60)
55
+ print("Testing Chunk Processor...")
56
+ print("=" * 60)
57
+
58
+ from retriever.processor import ChunkProcessor
59
+ from config_loader import cfg
60
+
61
+ # Test with first 5 pages only to speed up
62
+ test_df = df.head(5)
63
+
64
+ proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)
65
+ chunks = proc.process(
66
+ test_df,
67
+ technique=cfg.processing['technique'],
68
+ chunk_size=cfg.processing['chunk_size'],
69
+ chunk_overlap=cfg.processing['chunk_overlap'],
70
+ max_docs=5,
71
+ verbose=False
72
+ )
73
+
74
+ print(f"βœ“ Created {len(chunks)} chunks from 5 pages")
75
+ print(f"βœ“ Sample chunk ID: {chunks[0]['id']}")
76
+ print(f"βœ“ Sample chunk size: {len(chunks[0]['metadata']['text'])} chars")
77
+
78
+ assert len(chunks) > 0, "No chunks created"
79
+ assert 'values' in chunks[0], "Embeddings not generated"
80
+ assert len(chunks[0]['values']) == 384, f"Expected 384 dimensions, got {len(chunks[0]['values'])}"
81
+ print("βœ“ Chunk processor test PASSED\n")
82
+ return chunks
83
+
84
+ def test_generator():
85
+ """Test the RAG generator module."""
86
+ print("=" * 60)
87
+ print("Testing RAG Generator...")
88
+ print("=" * 60)
89
+
90
+ from retriever.generator import RAGGenerator
91
+
92
+ generator = RAGGenerator()
93
+
94
+ # Test prompt generation
95
+ test_contexts = [
96
+ "CBT is a form of psychotherapy developed by Aaron Beck.",
97
+ "The cognitive model proposes that dysfunctional thinking influences mood and behavior."
98
+ ]
99
+ test_query = "What is CBT?"
100
+
101
+ prompt = generator.generate_prompt(test_query, test_contexts)
102
+
103
+ print(f"βœ“ Generated prompt length: {len(prompt)} chars")
104
+ print(f"βœ“ Prompt contains query: {'What is CBT?' in prompt}")
105
+ print(f"βœ“ Prompt contains context: {'Aaron Beck' in prompt}")
106
+
107
+ assert "What is CBT?" in prompt, "Query not in prompt"
108
+ assert "Aaron Beck" in prompt, "Context not in prompt"
109
+ print("βœ“ RAG generator test PASSED\n")
110
+ return generator
111
+
112
+ def test_evaluator():
113
+ """Test the RAG evaluator module."""
114
+ print("=" * 60)
115
+ print("Testing RAG Evaluator...")
116
+ print("=" * 60)
117
+
118
+ from retriever.evaluator import GroqJudge
119
+
120
+ groq_key = os.getenv("GROQ_API_KEY")
121
+ if not groq_key:
122
+ print("⚠ GROQ_API_KEY not set, skipping evaluator test")
123
+ return None
124
+
125
+ judge = GroqJudge(api_key=groq_key, model="llama-3.1-8b-instant")
126
+
127
+ # Simple test prompt
128
+ test_prompt = "What is 2 + 2? Answer with just the number."
129
+ response = judge.generate(test_prompt)
130
+
131
+ print(f"βœ“ Judge response: {response}")
132
+ print("βœ“ Evaluator test PASSED\n")
133
+ return judge
134
+
135
+ def main():
136
+ """Run all tests."""
137
+ print("\n" + "=" * 60)
138
+ print("CBT RAG SYSTEM - BACKEND TESTS")
139
+ print("=" * 60 + "\n")
140
+
141
+ try:
142
+ # Test data loader
143
+ df = test_data_loader()
144
+
145
+ # Test config loader
146
+ cfg = test_config_loader()
147
+
148
+ # Test chunk processor
149
+ chunks = test_chunk_processor(df)
150
+
151
+ # Test generator
152
+ generator = test_generator()
153
+
154
+ # Test evaluator
155
+ judge = test_evaluator()
156
+
157
+ print("=" * 60)
158
+ print("ALL TESTS PASSED!")
159
+ print("=" * 60)
160
+ print("\nBackend is ready for use.")
161
+ print("\nNext steps:")
162
+ print("1. Run 'python ingest.py' to index the book into Pinecone")
163
+ print("2. Run 'python -m uvicorn api:app --host 0.0.0.0 --port 8000' to start the API")
164
+
165
+ except Exception as e:
166
+ print(f"\nβœ— TEST FAILED: {e}")
167
+ import traceback
168
+ traceback.print_exc()
169
+ sys.exit(1)
170
+
171
+ if __name__ == "__main__":
172
+ main()