Merge pull request #4 from ramailkk/Changed_Ingestion_Logic
Browse files- BACKEND_README.md +61 -0
- EntireBookCleaned.txt +0 -0
- api.py +1 -1
- config.yaml +13 -13
- data_loader.py +86 -36
- ingest.py +221 -0
- main.py +115 -25
- requirements.txt +1 -0
- retriever/processor.py +23 -1
- test_backend.py +172 -0
BACKEND_README.md
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CBT RAG System - Backend Documentation
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
|
| 5 |
+
This is a Retrieval-Augmented Generation (RAG) system for Cognitive Behavioral Therapy (CBT) content. It uses a CBT textbook as the knowledge base and implements hybrid search with re-ranking for accurate question answering.
|
| 6 |
+
|
| 7 |
+
## Embedding Model: Jina v2 Small
|
| 8 |
+
|
| 9 |
+
**Model**: `jinaai/jina-embeddings-v2-small-en`
|
| 10 |
+
|
| 11 |
+
| Property | Value |
|
| 12 |
+
| ---------- | ------ |
|
| 13 |
+
| Dimensions | 512 |
|
| 14 |
+
| Max Tokens | 8,192 |
|
| 15 |
+
| Parameters | 33M |
|
| 16 |
+
| Disk Size | ~130MB |
|
| 17 |
+
|
| 18 |
+
## Ingestion: 6 Chunking Techniques (Single Index)
|
| 19 |
+
|
| 20 |
+
The `ingest.py` script processes the CBT book **6 times** with different chunking strategies for the ablation study. All chunks are stored in a **SINGLE Pinecone index** with metadata to differentiate.
|
| 21 |
+
|
| 22 |
+
| # | Technique | Description |
|
| 23 |
+
| --- | --------- | ------------------------------------------------------------------------------------ |
|
| 24 |
+
| 1 | fixed | Fixed-size chunking - splits every N characters (may cut sentences mid-way) |
|
| 25 |
+
| 2 | sentence | Sentence-level chunking - respects sentence boundaries (NLTK) |
|
| 26 |
+
| 3 | paragraph | Paragraph-level chunking - uses natural paragraph breaks |
|
| 27 |
+
| 4 | semantic | Semantic chunking - splits where topic/meaning shifts (embedding similarity) |
|
| 28 |
+
| 5 | recursive | Recursive chunking - hierarchical splitting (paragraphs β sentences β words β chars) |
|
| 29 |
+
| 6 | page | Page-level chunking - uses entire book pages as-is |
|
| 30 |
+
|
| 31 |
+
**Single Pinecone Index:** `cbt-book-recursive`
|
| 32 |
+
|
| 33 |
+
**Metadata:** Each chunk includes `chunking_technique` field for filtering.
|
| 34 |
+
|
| 35 |
+
## Configuration (`config.yaml`)
|
| 36 |
+
|
| 37 |
+
```yaml
|
| 38 |
+
processing:
|
| 39 |
+
embedding_model: "jinaai/jina-embeddings-v2-small-en"
|
| 40 |
+
technique: "recursive"
|
| 41 |
+
chunk_size: 1000
|
| 42 |
+
chunk_overlap: 100
|
| 43 |
+
|
| 44 |
+
vector_db:
|
| 45 |
+
base_index_name: "cbt-book"
|
| 46 |
+
dimension: 512
|
| 47 |
+
metric: "cosine"
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
## Running the System
|
| 51 |
+
|
| 52 |
+
```bash
|
| 53 |
+
source venv/bin/activate
|
| 54 |
+
pip install -r requirements.txt
|
| 55 |
+
python ingest.py
|
| 56 |
+
python -m uvicorn api:app --host 0.0.0.0 --port 8000
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
## Pinecone Setup
|
| 60 |
+
|
| 61 |
+
Create index `cbt-book-recursive` with 512 dimensions, cosine metric.
|
EntireBookCleaned.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
api.py
CHANGED
|
@@ -208,7 +208,7 @@ def startup_event() -> None:
|
|
| 208 |
if not hf_token:
|
| 209 |
raise RuntimeError("HF_TOKEN not found in environment variables")
|
| 210 |
|
| 211 |
-
index_name = "
|
| 212 |
embed_model_name = "all-MiniLM-L6-v2"
|
| 213 |
project_root = os.path.dirname(os.path.abspath(__file__))
|
| 214 |
cache_dir = os.getenv("BM25_CACHE_DIR", os.path.join(project_root, ".cache"))
|
|
|
|
| 208 |
if not hf_token:
|
| 209 |
raise RuntimeError("HF_TOKEN not found in environment variables")
|
| 210 |
|
| 211 |
+
index_name = "cbt-book-recursive"
|
| 212 |
embed_model_name = "all-MiniLM-L6-v2"
|
| 213 |
project_root = os.path.dirname(os.path.abspath(__file__))
|
| 214 |
cache_dir = os.getenv("BM25_CACHE_DIR", os.path.join(project_root, ".cache"))
|
config.yaml
CHANGED
|
@@ -1,26 +1,26 @@
|
|
| 1 |
# ------------------------------------------------------------------
|
| 2 |
-
# RAG
|
| 3 |
# ------------------------------------------------------------------
|
| 4 |
|
| 5 |
project:
|
| 6 |
-
name: "
|
| 7 |
-
category: "
|
| 8 |
-
doc_limit:
|
| 9 |
|
| 10 |
processing:
|
| 11 |
# Embedding model used for both vector db and evaluator similarity
|
| 12 |
-
embedding_model: "
|
| 13 |
# Options: sentence, recursive, semantic, fixed
|
| 14 |
-
technique: "recursive"
|
| 15 |
-
#
|
| 16 |
-
chunk_size:
|
| 17 |
-
chunk_overlap:
|
| 18 |
|
| 19 |
vector_db:
|
| 20 |
-
base_index_name: "
|
| 21 |
-
dimension:
|
| 22 |
metric: "cosine"
|
| 23 |
-
batch_size:
|
| 24 |
|
| 25 |
retrieval:
|
| 26 |
# Options: hybrid, semantic, bm25
|
|
@@ -43,4 +43,4 @@ models:
|
|
| 43 |
- "Mistral-7B"
|
| 44 |
- "Qwen-2.5"
|
| 45 |
- "DeepSeek-V3"
|
| 46 |
-
- "TinyAya"
|
|
|
|
| 1 |
# ------------------------------------------------------------------
|
| 2 |
+
# RAG CBT QUESTION-ANSWERING SYSTEM CONFIGURATION
|
| 3 |
# ------------------------------------------------------------------
|
| 4 |
|
| 5 |
project:
|
| 6 |
+
name: "cbt-rag-system"
|
| 7 |
+
category: "psychology"
|
| 8 |
+
doc_limit: null # Load all pages from the book
|
| 9 |
|
| 10 |
processing:
|
| 11 |
# Embedding model used for both vector db and evaluator similarity
|
| 12 |
+
embedding_model: "jinaai/jina-embeddings-v2-small-en"
|
| 13 |
# Options: sentence, recursive, semantic, fixed
|
| 14 |
+
technique: "recursive"
|
| 15 |
+
# Jina supports 8192 tokens (~32k chars), using 1000 chars for better context
|
| 16 |
+
chunk_size: 1000
|
| 17 |
+
chunk_overlap: 100
|
| 18 |
|
| 19 |
vector_db:
|
| 20 |
+
base_index_name: "cbt-book"
|
| 21 |
+
dimension: 512 # Jina outputs 512 dimensions
|
| 22 |
metric: "cosine"
|
| 23 |
+
batch_size: 50 # Reduced batch size for CPU processing
|
| 24 |
|
| 25 |
retrieval:
|
| 26 |
# Options: hybrid, semantic, bm25
|
|
|
|
| 43 |
- "Mistral-7B"
|
| 44 |
- "Qwen-2.5"
|
| 45 |
- "DeepSeek-V3"
|
| 46 |
+
- "TinyAya"
|
data_loader.py
CHANGED
|
@@ -1,41 +1,91 @@
|
|
| 1 |
-
import
|
| 2 |
-
import requests
|
| 3 |
-
import io
|
| 4 |
-
import arxiv
|
| 5 |
import pandas as pd
|
|
|
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
try:
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
def
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
)
|
| 28 |
-
|
| 29 |
-
results = []
|
| 30 |
-
for r in client.results(search):
|
| 31 |
-
print(f"Downloading full text for: {r.title[:50]}...")
|
| 32 |
-
full_text = extract_text_from_url(r.pdf_url)
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
|
|
|
|
|
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
+
from typing import List, Dict, Any
|
| 4 |
|
| 5 |
+
|
| 6 |
+
def load_cbt_book(file_path: str = "EntireBookCleaned.txt") -> pd.DataFrame:
|
| 7 |
+
"""
|
| 8 |
+
Loads the CBT book from a text file and parses it into documents.
|
| 9 |
+
Each page is treated as a separate document.
|
| 10 |
+
|
| 11 |
+
Args:
|
| 12 |
+
file_path: Path to the cleaned book text file
|
| 13 |
+
|
| 14 |
+
Returns:
|
| 15 |
+
DataFrame with columns: id, title, url, full_text
|
| 16 |
+
"""
|
| 17 |
try:
|
| 18 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 19 |
+
content = f.read()
|
| 20 |
+
except FileNotFoundError:
|
| 21 |
+
raise FileNotFoundError(f"Book file not found: {file_path}")
|
| 22 |
+
|
| 23 |
+
# Split content by page markers
|
| 24 |
+
# Pattern matches "--- Page X ---" or "--- Page X of Y ---"
|
| 25 |
+
page_pattern = r'---\s*Page\s+(\d+)(?:\s+of\s+\d+)?\s*---'
|
| 26 |
+
|
| 27 |
+
# Split the content into pages
|
| 28 |
+
pages = re.split(page_pattern, content)
|
| 29 |
+
|
| 30 |
+
# pages[0] is content before first page marker (usually empty)
|
| 31 |
+
# Then alternating: page_number, page_content, page_number, page_content...
|
| 32 |
+
|
| 33 |
+
documents = []
|
| 34 |
+
i = 1 # Start from first page number
|
| 35 |
+
|
| 36 |
+
while i < len(pages) - 1:
|
| 37 |
+
page_num = pages[i].strip()
|
| 38 |
+
page_content = pages[i + 1].strip() if i + 1 < len(pages) else ""
|
| 39 |
+
|
| 40 |
+
# Clean up the content - remove excessive whitespace
|
| 41 |
+
page_content = re.sub(r'\n{3,}', '\n\n', page_content)
|
| 42 |
+
page_content = page_content.strip()
|
| 43 |
+
|
| 44 |
+
if page_content: # Only add non-empty pages
|
| 45 |
+
# Extract a title from the first line if possible
|
| 46 |
+
lines = page_content.split('\n')
|
| 47 |
+
title_line = lines[0].strip() if lines else f"Page {page_num}"
|
| 48 |
+
|
| 49 |
+
# Use first meaningful line as title, or default to page number
|
| 50 |
+
if len(title_line) > 10 and len(title_line) < 200:
|
| 51 |
+
title = title_line
|
| 52 |
+
else:
|
| 53 |
+
title = f"CBT Book - Page {page_num}"
|
| 54 |
+
|
| 55 |
+
documents.append({
|
| 56 |
+
"id": f"cbt-page-{page_num}",
|
| 57 |
+
"title": title,
|
| 58 |
+
"url": f"https://res.cloudinary.com/dajb4c1g5/image/upload/v1774864993/topic_pdfs/93/merged_pdf_1774864989649.pdf.pdf#page={page_num}",
|
| 59 |
+
"full_text": page_content
|
| 60 |
+
})
|
| 61 |
+
|
| 62 |
+
i += 2 # Move to next page number
|
| 63 |
+
|
| 64 |
+
if not documents:
|
| 65 |
+
raise ValueError("No documents were parsed from the book file")
|
| 66 |
+
|
| 67 |
+
df = pd.DataFrame(documents)
|
| 68 |
+
print(f"Loaded {len(df)} pages from CBT book")
|
| 69 |
+
return df
|
| 70 |
+
|
| 71 |
|
| 72 |
+
def get_book_stats(df: pd.DataFrame) -> Dict[str, Any]:
|
| 73 |
+
"""
|
| 74 |
+
Get statistics about the loaded book.
|
| 75 |
+
|
| 76 |
+
Args:
|
| 77 |
+
df: DataFrame containing book pages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
+
Returns:
|
| 80 |
+
Dictionary with statistics
|
| 81 |
+
"""
|
| 82 |
+
total_chars = df['full_text'].str.len().sum()
|
| 83 |
+
avg_chars = df['full_text'].str.len().mean()
|
| 84 |
+
|
| 85 |
+
return {
|
| 86 |
+
"total_pages": len(df),
|
| 87 |
+
"total_characters": total_chars,
|
| 88 |
+
"average_chars_per_page": round(avg_chars, 2),
|
| 89 |
+
"min_chars": df['full_text'].str.len().min(),
|
| 90 |
+
"max_chars": df['full_text'].str.len().max()
|
| 91 |
+
}
|
ingest.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Script to ingest CBT book data into Pinecone vector database.
|
| 3 |
+
Ingests the book 6 times with different chunking formats for ablation study.
|
| 4 |
+
All chunks are stored in a SINGLE index with metadata to differentiate.
|
| 5 |
+
Run this once before starting the API server.
|
| 6 |
+
"""
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
from config_loader import cfg
|
| 11 |
+
from data_loader import load_cbt_book, get_book_stats
|
| 12 |
+
from vector_db import get_pinecone_index, refresh_pinecone_index
|
| 13 |
+
from retriever.processor import ChunkProcessor
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# 6 different chunking techniques for ablation study
|
| 17 |
+
CHUNKING_TECHNIQUES = [
|
| 18 |
+
{
|
| 19 |
+
"name": "fixed",
|
| 20 |
+
"description": "Fixed-size chunking - splits every N characters (may cut sentences mid-way)",
|
| 21 |
+
"chunk_size": 1000,
|
| 22 |
+
"chunk_overlap": 100,
|
| 23 |
+
"kwargs": {"separator": ""}, # No separator for fixed splitting
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"name": "sentence",
|
| 27 |
+
"description": "Sentence-level chunking - respects sentence boundaries (NLTK)",
|
| 28 |
+
"chunk_size": 2400,
|
| 29 |
+
"chunk_overlap": 100,
|
| 30 |
+
"kwargs": {},
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"name": "paragraph",
|
| 34 |
+
"description": "Paragraph-level chunking - uses natural paragraph breaks",
|
| 35 |
+
"chunk_size": 4000,
|
| 36 |
+
"chunk_overlap": 100,
|
| 37 |
+
"kwargs": {"separator": "\n\n"}, # Split on paragraph breaks
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"name": "semantic",
|
| 41 |
+
"description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
|
| 42 |
+
"chunk_size": 2000,
|
| 43 |
+
"chunk_overlap": 100,
|
| 44 |
+
"kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"name": "recursive",
|
| 48 |
+
"description": "Recursive chunking - hierarchical splitting (paragraphs β sentences β words β chars)",
|
| 49 |
+
"chunk_size": 2000,
|
| 50 |
+
"chunk_overlap": 100,
|
| 51 |
+
"kwargs": {"separators": ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""], "keep_separator": True},
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"name": "page",
|
| 55 |
+
"description": "Page-level chunking - uses entire book pages as-is",
|
| 56 |
+
"chunk_size": 10000, # Very large to keep full pages
|
| 57 |
+
"chunk_overlap": 0, # No overlap between pages
|
| 58 |
+
"kwargs": {"separator": "--- Page"}, # Split on page markers
|
| 59 |
+
},
|
| 60 |
+
]
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def ingest_single_technique(
|
| 64 |
+
raw_data,
|
| 65 |
+
proc,
|
| 66 |
+
technique_config,
|
| 67 |
+
technique_index,
|
| 68 |
+
total_techniques,
|
| 69 |
+
):
|
| 70 |
+
"""Chunk the book using a single technique and return chunks with metadata."""
|
| 71 |
+
technique_name = technique_config["name"]
|
| 72 |
+
chunk_size = technique_config["chunk_size"]
|
| 73 |
+
chunk_overlap = technique_config["chunk_overlap"]
|
| 74 |
+
kwargs = technique_config.get("kwargs", {})
|
| 75 |
+
|
| 76 |
+
print(f"\n[{technique_index}/{total_techniques}] Processing '{technique_name}'...")
|
| 77 |
+
print(f" Description: {technique_config['description']}")
|
| 78 |
+
print(f" Chunk size: {chunk_size}, Overlap: {chunk_overlap}")
|
| 79 |
+
|
| 80 |
+
# Chunk and embed
|
| 81 |
+
final_chunks = proc.process(
|
| 82 |
+
raw_data,
|
| 83 |
+
technique=technique_name,
|
| 84 |
+
chunk_size=chunk_size,
|
| 85 |
+
chunk_overlap=chunk_overlap,
|
| 86 |
+
max_docs=cfg.project.get("doc_limit"),
|
| 87 |
+
verbose=False,
|
| 88 |
+
**kwargs,
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
# Add technique metadata to each chunk for differentiation
|
| 92 |
+
# Prefix ID with technique name to ensure uniqueness across techniques
|
| 93 |
+
for chunk in final_chunks:
|
| 94 |
+
chunk["metadata"]["chunking_technique"] = technique_name
|
| 95 |
+
chunk["id"] = f"{technique_name}-{chunk['id']}"
|
| 96 |
+
|
| 97 |
+
print(f" Created {len(final_chunks)} chunks")
|
| 98 |
+
|
| 99 |
+
return final_chunks
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def ingest_data():
|
| 103 |
+
"""Load CBT book, chunk it 6 ways, and upload ALL to a SINGLE Pinecone index."""
|
| 104 |
+
load_dotenv()
|
| 105 |
+
|
| 106 |
+
pinecone_key = os.getenv("PINECONE_API_KEY")
|
| 107 |
+
if not pinecone_key:
|
| 108 |
+
raise RuntimeError("PINECONE_API_KEY not found in environment variables")
|
| 109 |
+
|
| 110 |
+
print("=" * 80)
|
| 111 |
+
print("CBT BOOK INGESTION PIPELINE - 6 TECHNIQUES (SINGLE INDEX)")
|
| 112 |
+
print("=" * 80)
|
| 113 |
+
print(f"\nTechniques to process: {len(CHUNKING_TECHNIQUES)}")
|
| 114 |
+
for i, tech in enumerate(CHUNKING_TECHNIQUES, 1):
|
| 115 |
+
print(f" {i}. {tech['name']}: {tech['description']}")
|
| 116 |
+
print(f"\nAll chunks will be stored in a SINGLE index: {cfg.db['base_index_name']}-{cfg.processing['technique']}")
|
| 117 |
+
print("Chunks are differentiated by 'chunking_technique' metadata field.")
|
| 118 |
+
|
| 119 |
+
# 1. Load the CBT book (once, reused for all techniques)
|
| 120 |
+
print(f"\n{'='*80}")
|
| 121 |
+
print("STEP 1: LOADING CBT BOOK")
|
| 122 |
+
print(f"{'='*80}")
|
| 123 |
+
print("\nLoading CBT book from EntireBookCleaned.txt...")
|
| 124 |
+
raw_data = load_cbt_book("EntireBookCleaned.txt")
|
| 125 |
+
stats = get_book_stats(raw_data)
|
| 126 |
+
print(f" Loaded {stats['total_pages']} pages")
|
| 127 |
+
print(f" Total characters: {stats['total_characters']:,}")
|
| 128 |
+
print(f" Average chars per page: {stats['average_chars_per_page']:.0f}")
|
| 129 |
+
|
| 130 |
+
# 2. Initialize processor (once, reused for all techniques)
|
| 131 |
+
print(f"\nInitializing embedding model: {cfg.processing['embedding_model']}")
|
| 132 |
+
proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)
|
| 133 |
+
|
| 134 |
+
# 3. Process each technique sequentially and collect all chunks
|
| 135 |
+
print(f"\n{'='*80}")
|
| 136 |
+
print("STEP 2: CHUNKING WITH 6 TECHNIQUES")
|
| 137 |
+
print(f"{'='*80}")
|
| 138 |
+
|
| 139 |
+
all_chunks = []
|
| 140 |
+
results = {}
|
| 141 |
+
|
| 142 |
+
for i, technique in enumerate(CHUNKING_TECHNIQUES, 1):
|
| 143 |
+
try:
|
| 144 |
+
chunks = ingest_single_technique(
|
| 145 |
+
raw_data=raw_data,
|
| 146 |
+
proc=proc,
|
| 147 |
+
technique_config=technique,
|
| 148 |
+
technique_index=i,
|
| 149 |
+
total_techniques=len(CHUNKING_TECHNIQUES),
|
| 150 |
+
)
|
| 151 |
+
all_chunks.extend(chunks)
|
| 152 |
+
results[technique["name"]] = {
|
| 153 |
+
"status": "success",
|
| 154 |
+
"chunks": len(chunks),
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
# Wait between techniques to avoid rate limits (for embedding API)
|
| 158 |
+
if i < len(CHUNKING_TECHNIQUES):
|
| 159 |
+
print(f" Waiting 5 seconds before next technique (rate limit protection)...")
|
| 160 |
+
time.sleep(5)
|
| 161 |
+
|
| 162 |
+
except Exception as e:
|
| 163 |
+
print(f" ERROR with technique '{technique['name']}': {e}")
|
| 164 |
+
results[technique["name"]] = {
|
| 165 |
+
"status": "failed",
|
| 166 |
+
"error": str(e),
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
# 4. Upload ALL chunks to a SINGLE Pinecone index
|
| 170 |
+
print(f"\n{'='*80}")
|
| 171 |
+
print("STEP 3: UPLOADING TO SINGLE PINECONE INDEX")
|
| 172 |
+
print(f"{'='*80}")
|
| 173 |
+
|
| 174 |
+
index_name = f"{cfg.db['base_index_name']}-{cfg.processing['technique']}"
|
| 175 |
+
print(f"\nIndex name: {index_name}")
|
| 176 |
+
print(f"Dimension: {cfg.db['dimension']}")
|
| 177 |
+
print(f"Metric: {cfg.db['metric']}")
|
| 178 |
+
print(f"Total chunks to upload: {len(all_chunks)}")
|
| 179 |
+
|
| 180 |
+
index = get_pinecone_index(
|
| 181 |
+
pinecone_key,
|
| 182 |
+
cfg.db['base_index_name'],
|
| 183 |
+
technique=cfg.processing['technique'],
|
| 184 |
+
dimension=cfg.db['dimension'],
|
| 185 |
+
metric=cfg.db['metric'],
|
| 186 |
+
)
|
| 187 |
+
|
| 188 |
+
print(f"\nUploading {len(all_chunks)} vectors to Pinecone...")
|
| 189 |
+
refresh_pinecone_index(index, all_chunks, batch_size=cfg.db['batch_size'])
|
| 190 |
+
|
| 191 |
+
# 5. Summary
|
| 192 |
+
print(f"\n{'='*80}")
|
| 193 |
+
print("INGESTION COMPLETE - SUMMARY")
|
| 194 |
+
print(f"{'='*80}")
|
| 195 |
+
print(f"\n{'Technique':<15} {'Status':<12} {'Chunks':<10}")
|
| 196 |
+
print("-" * 40)
|
| 197 |
+
total_chunks = 0
|
| 198 |
+
for tech in CHUNKING_TECHNIQUES:
|
| 199 |
+
name = tech["name"]
|
| 200 |
+
result = results.get(name, {})
|
| 201 |
+
status = result.get("status", "unknown")
|
| 202 |
+
chunks = result.get("chunks", 0)
|
| 203 |
+
if status == "success":
|
| 204 |
+
total_chunks += chunks
|
| 205 |
+
print(f"{name:<15} {status:<12} {chunks:<10}")
|
| 206 |
+
print("-" * 40)
|
| 207 |
+
print(f"{'TOTAL':<15} {'':<12} {total_chunks:<10}")
|
| 208 |
+
|
| 209 |
+
print(f"\nSingle index: {index_name}")
|
| 210 |
+
print(f"Total vectors: {len(all_chunks)}")
|
| 211 |
+
print("\nChunks can be filtered by 'chunking_technique' metadata field:")
|
| 212 |
+
for tech in CHUNKING_TECHNIQUES:
|
| 213 |
+
if results.get(tech["name"], {}).get("status") == "success":
|
| 214 |
+
print(f" - chunking_technique: '{tech['name']}'")
|
| 215 |
+
|
| 216 |
+
print("\nYou can now start the API server with:")
|
| 217 |
+
print(" python -m uvicorn api:app --host 0.0.0.0 --port 8000")
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
if __name__ == "__main__":
|
| 221 |
+
ingest_data()
|
main.py
CHANGED
|
@@ -1,15 +1,15 @@
|
|
| 1 |
import os
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
-
from config_loader import cfg
|
| 4 |
|
| 5 |
from vector_db import get_pinecone_index, refresh_pinecone_index
|
| 6 |
from retriever.retriever import HybridRetriever
|
| 7 |
from retriever.generator import RAGGenerator
|
| 8 |
from retriever.processor import ChunkProcessor
|
| 9 |
from retriever.evaluator import RAGEvaluator
|
| 10 |
-
|
| 11 |
|
| 12 |
-
# Import
|
| 13 |
from models.llama_3_8b import Llama3_8B
|
| 14 |
from models.mistral_7b import Mistral_7b
|
| 15 |
from models.qwen_2_5 import Qwen2_5
|
|
@@ -26,16 +26,36 @@ MODEL_MAP = {
|
|
| 26 |
|
| 27 |
load_dotenv()
|
| 28 |
|
|
|
|
| 29 |
def main():
|
|
|
|
| 30 |
hf_token = os.getenv("HF_TOKEN")
|
| 31 |
pinecone_key = os.getenv("PINECONE_API_KEY")
|
| 32 |
-
|
| 33 |
|
| 34 |
-
#
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
# 2. Chunking & Embedding
|
| 41 |
proc = ChunkProcessor(model_name=cfg.processing['embedding_model'])
|
|
@@ -43,20 +63,38 @@ def main():
|
|
| 43 |
raw_data,
|
| 44 |
technique=cfg.processing['technique'],
|
| 45 |
chunk_size=cfg.processing['chunk_size'],
|
| 46 |
-
chunk_overlap=cfg.processing['chunk_overlap']
|
|
|
|
|
|
|
| 47 |
)
|
| 48 |
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
index = get_pinecone_index(
|
| 51 |
-
pinecone_key,
|
| 52 |
-
cfg.db['base_index_name'],
|
| 53 |
technique=cfg.processing['technique'],
|
| 54 |
dimension=cfg.db['dimension']
|
| 55 |
)
|
| 56 |
refresh_pinecone_index(index, final_chunks, batch_size=cfg.db['batch_size'])
|
| 57 |
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
retriever = HybridRetriever(final_chunks, proc.encoder)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
context_chunks = retriever.search(
|
| 61 |
query, index,
|
| 62 |
mode=cfg.retrieval['mode'],
|
|
@@ -66,44 +104,96 @@ def main():
|
|
| 66 |
final_k=cfg.retrieval['final_k']
|
| 67 |
)
|
| 68 |
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
rag_engine = RAGGenerator()
|
| 71 |
models = {name: MODEL_MAP[name](token=hf_token) for name in cfg.model_list}
|
| 72 |
-
|
| 73 |
-
# Setup Evaluator with the designated Judge
|
| 74 |
|
|
|
|
| 75 |
evaluator = RAGEvaluator(
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
)
|
| 80 |
|
| 81 |
tournament_results = {}
|
| 82 |
|
| 83 |
-
#
|
| 84 |
for name, model_inst in models.items():
|
| 85 |
-
print(f"\n
|
|
|
|
|
|
|
| 86 |
try:
|
| 87 |
# Generation
|
| 88 |
answer = rag_engine.get_answer(
|
| 89 |
-
model_inst, query, context_chunks,
|
| 90 |
temperature=cfg.gen['temperature']
|
| 91 |
)
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
| 93 |
# Faithfulness Evaluation
|
| 94 |
faith = evaluator.evaluate_faithfulness(answer, context_chunks)
|
| 95 |
# Relevancy Evaluation
|
| 96 |
rel = evaluator.evaluate_relevancy(query, answer)
|
| 97 |
|
| 98 |
tournament_results[name] = {
|
|
|
|
| 99 |
"Faithfulness": faith['score'],
|
| 100 |
"Relevancy": rel['score'],
|
| 101 |
"Claims": faith['details']
|
| 102 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
except Exception as e:
|
| 104 |
print(f"Error evaluating {name}: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
-
# 7. Final Output (Omitted for brevity, use your existing report logic)
|
| 107 |
|
| 108 |
if __name__ == "__main__":
|
| 109 |
main()
|
|
|
|
| 1 |
import os
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
+
from config_loader import cfg
|
| 4 |
|
| 5 |
from vector_db import get_pinecone_index, refresh_pinecone_index
|
| 6 |
from retriever.retriever import HybridRetriever
|
| 7 |
from retriever.generator import RAGGenerator
|
| 8 |
from retriever.processor import ChunkProcessor
|
| 9 |
from retriever.evaluator import RAGEvaluator
|
| 10 |
+
from data_loader import load_cbt_book, get_book_stats
|
| 11 |
|
| 12 |
+
# Import model fleet
|
| 13 |
from models.llama_3_8b import Llama3_8B
|
| 14 |
from models.mistral_7b import Mistral_7b
|
| 15 |
from models.qwen_2_5 import Qwen2_5
|
|
|
|
| 26 |
|
| 27 |
load_dotenv()
|
| 28 |
|
| 29 |
+
|
| 30 |
def main():
|
| 31 |
+
"""Main function to run the RAG tournament on CBT book."""
|
| 32 |
hf_token = os.getenv("HF_TOKEN")
|
| 33 |
pinecone_key = os.getenv("PINECONE_API_KEY")
|
| 34 |
+
groq_key = os.getenv("GROQ_API_KEY")
|
| 35 |
|
| 36 |
+
# Verify environment variables
|
| 37 |
+
if not hf_token:
|
| 38 |
+
raise RuntimeError("HF_TOKEN not found in environment variables")
|
| 39 |
+
if not pinecone_key:
|
| 40 |
+
raise RuntimeError("PINECONE_API_KEY not found in environment variables")
|
| 41 |
+
if not groq_key:
|
| 42 |
+
raise RuntimeError("GROQ_API_KEY not found in environment variables")
|
| 43 |
+
|
| 44 |
+
# Example query for testing
|
| 45 |
+
query = "What is cognitive behavior therapy and how does it work?"
|
| 46 |
+
|
| 47 |
+
print("=" * 80)
|
| 48 |
+
print("CBT RAG SYSTEM - LOADING DATA")
|
| 49 |
+
print("=" * 80)
|
| 50 |
+
|
| 51 |
+
# 1. Data Ingestion - Load CBT Book
|
| 52 |
+
raw_data = load_cbt_book("EntireBookCleaned.txt")
|
| 53 |
+
stats = get_book_stats(raw_data)
|
| 54 |
+
print(f"Book Statistics: {stats}")
|
| 55 |
+
|
| 56 |
+
print("\n" + "=" * 80)
|
| 57 |
+
print("CHUNKING AND EMBEDDING")
|
| 58 |
+
print("=" * 80)
|
| 59 |
|
| 60 |
# 2. Chunking & Embedding
|
| 61 |
proc = ChunkProcessor(model_name=cfg.processing['embedding_model'])
|
|
|
|
| 63 |
raw_data,
|
| 64 |
technique=cfg.processing['technique'],
|
| 65 |
chunk_size=cfg.processing['chunk_size'],
|
| 66 |
+
chunk_overlap=cfg.processing['chunk_overlap'],
|
| 67 |
+
max_docs=cfg.project.get('doc_limit'), # None means load all
|
| 68 |
+
verbose=True
|
| 69 |
)
|
| 70 |
|
| 71 |
+
print(f"\nTotal chunks created: {len(final_chunks)}")
|
| 72 |
+
|
| 73 |
+
print("\n" + "=" * 80)
|
| 74 |
+
print("VECTOR DATABASE SETUP")
|
| 75 |
+
print("=" * 80)
|
| 76 |
+
|
| 77 |
+
# 3. Vector DB - Create/Update Pinecone Index
|
| 78 |
index = get_pinecone_index(
|
| 79 |
+
pinecone_key,
|
| 80 |
+
cfg.db['base_index_name'],
|
| 81 |
technique=cfg.processing['technique'],
|
| 82 |
dimension=cfg.db['dimension']
|
| 83 |
)
|
| 84 |
refresh_pinecone_index(index, final_chunks, batch_size=cfg.db['batch_size'])
|
| 85 |
|
| 86 |
+
print("\n" + "=" * 80)
|
| 87 |
+
print("RETRIEVAL SETUP")
|
| 88 |
+
print("=" * 80)
|
| 89 |
+
|
| 90 |
+
# 4. Retrieval Setup
|
| 91 |
retriever = HybridRetriever(final_chunks, proc.encoder)
|
| 92 |
+
|
| 93 |
+
print("\n" + "=" * 80)
|
| 94 |
+
print(f"TESTING QUERY: {query}")
|
| 95 |
+
print("=" * 80)
|
| 96 |
+
|
| 97 |
+
# Test retrieval
|
| 98 |
context_chunks = retriever.search(
|
| 99 |
query, index,
|
| 100 |
mode=cfg.retrieval['mode'],
|
|
|
|
| 104 |
final_k=cfg.retrieval['final_k']
|
| 105 |
)
|
| 106 |
|
| 107 |
+
print(f"\nRetrieved {len(context_chunks)} context chunks")
|
| 108 |
+
|
| 109 |
+
print("\n" + "=" * 80)
|
| 110 |
+
print("MODEL TOURNAMENT")
|
| 111 |
+
print("=" * 80)
|
| 112 |
+
|
| 113 |
+
# 5. Initialize Models
|
| 114 |
rag_engine = RAGGenerator()
|
| 115 |
models = {name: MODEL_MAP[name](token=hf_token) for name in cfg.model_list}
|
|
|
|
|
|
|
| 116 |
|
| 117 |
+
# 6. Setup Evaluator with Judge
|
| 118 |
evaluator = RAGEvaluator(
|
| 119 |
+
judge_model=cfg.gen['judge_model'],
|
| 120 |
+
embedding_model=proc.encoder,
|
| 121 |
+
api_key=groq_key
|
| 122 |
)
|
| 123 |
|
| 124 |
tournament_results = {}
|
| 125 |
|
| 126 |
+
# 7. Tournament Loop
|
| 127 |
for name, model_inst in models.items():
|
| 128 |
+
print(f"\n{'='*60}")
|
| 129 |
+
print(f"Processing {name}")
|
| 130 |
+
print('='*60)
|
| 131 |
try:
|
| 132 |
# Generation
|
| 133 |
answer = rag_engine.get_answer(
|
| 134 |
+
model_inst, query, context_chunks,
|
| 135 |
temperature=cfg.gen['temperature']
|
| 136 |
)
|
| 137 |
+
|
| 138 |
+
print(f"\nAnswer from {name}:")
|
| 139 |
+
print(answer[:500] + "..." if len(answer) > 500 else answer)
|
| 140 |
+
|
| 141 |
# Faithfulness Evaluation
|
| 142 |
faith = evaluator.evaluate_faithfulness(answer, context_chunks)
|
| 143 |
# Relevancy Evaluation
|
| 144 |
rel = evaluator.evaluate_relevancy(query, answer)
|
| 145 |
|
| 146 |
tournament_results[name] = {
|
| 147 |
+
"answer": answer,
|
| 148 |
"Faithfulness": faith['score'],
|
| 149 |
"Relevancy": rel['score'],
|
| 150 |
"Claims": faith['details']
|
| 151 |
}
|
| 152 |
+
|
| 153 |
+
print(f"\n{name} Results:")
|
| 154 |
+
print(f" Faithfulness: {faith['score']:.1f}%")
|
| 155 |
+
print(f" Relevancy: {rel['score']:.3f}")
|
| 156 |
+
|
| 157 |
except Exception as e:
|
| 158 |
print(f"Error evaluating {name}: {e}")
|
| 159 |
+
tournament_results[name] = {
|
| 160 |
+
"answer": "",
|
| 161 |
+
"Faithfulness": 0,
|
| 162 |
+
"Relevancy": 0,
|
| 163 |
+
"Claims": [],
|
| 164 |
+
"error": str(e)
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
# 8. Final Results Summary
|
| 168 |
+
print("\n" + "=" * 80)
|
| 169 |
+
print("TOURNAMENT RESULTS SUMMARY")
|
| 170 |
+
print("=" * 80)
|
| 171 |
+
|
| 172 |
+
print(f"\nQuery: {query}")
|
| 173 |
+
print(f"\nRetrieved Context Chunks: {len(context_chunks)}")
|
| 174 |
+
print("\n" + "-" * 60)
|
| 175 |
+
print(f"{'Model':<20} {'Faithfulness':>15} {'Relevancy':>15}")
|
| 176 |
+
print("-" * 60)
|
| 177 |
+
|
| 178 |
+
for name, results in tournament_results.items():
|
| 179 |
+
faith = results.get('Faithfulness', 0)
|
| 180 |
+
rel = results.get('Relevancy', 0)
|
| 181 |
+
print(f"{name:<20} {faith:>14.1f}% {rel:>15.3f}")
|
| 182 |
+
|
| 183 |
+
print("-" * 60)
|
| 184 |
+
|
| 185 |
+
# Find best model
|
| 186 |
+
if tournament_results:
|
| 187 |
+
best_model = max(
|
| 188 |
+
tournament_results.items(),
|
| 189 |
+
key=lambda x: x[1].get('Faithfulness', 0) + x[1].get('Relevancy', 0)
|
| 190 |
+
)
|
| 191 |
+
print(f"\nBest Overall Model: {best_model[0]}")
|
| 192 |
+
print(f" Faithfulness: {best_model[1]['Faithfulness']:.1f}%")
|
| 193 |
+
print(f" Relevancy: {best_model[1]['Relevancy']:.3f}")
|
| 194 |
+
|
| 195 |
+
return tournament_results
|
| 196 |
|
|
|
|
| 197 |
|
| 198 |
if __name__ == "__main__":
|
| 199 |
main()
|
requirements.txt
CHANGED
|
@@ -93,3 +93,4 @@ uuid_utils==0.14.1
|
|
| 93 |
xxhash==3.6.0
|
| 94 |
yarl==1.23.0
|
| 95 |
zstandard==0.25.0
|
|
|
|
|
|
| 93 |
xxhash==3.6.0
|
| 94 |
yarl==1.23.0
|
| 95 |
zstandard==0.25.0
|
| 96 |
+
groq==0.13.0
|
retriever/processor.py
CHANGED
|
@@ -37,8 +37,10 @@ class ChunkProcessor:
|
|
| 37 |
- "fixed": Character-based, may split mid-sentence
|
| 38 |
- "recursive": Recursive character splitting with hierarchical separators
|
| 39 |
- "character": Character-based splitting on paragraph boundaries
|
|
|
|
| 40 |
- "sentence": Sliding window over NLTK sentences
|
| 41 |
- "semantic": Embedding-based semantic chunking
|
|
|
|
| 42 |
"""
|
| 43 |
if technique == "fixed":
|
| 44 |
return CharacterTextSplitter(
|
|
@@ -67,6 +69,16 @@ class ChunkProcessor:
|
|
| 67 |
is_separator_regex=False
|
| 68 |
)
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
elif technique == "sentence":
|
| 71 |
# sentence-level chunking using NLTK
|
| 72 |
return NLTKTextSplitter(
|
|
@@ -83,8 +95,18 @@ class ChunkProcessor:
|
|
| 83 |
breakpoint_threshold_amount=kwargs.get('breakpoint_threshold_amount', 70)
|
| 84 |
)
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
else:
|
| 87 |
-
raise ValueError(f"Technique '{technique}' is not supported. Choose from: fixed, recursive, character, sentence, semantic")
|
| 88 |
|
| 89 |
# ------------------------------------------------------------------
|
| 90 |
# Processing
|
|
|
|
| 37 |
- "fixed": Character-based, may split mid-sentence
|
| 38 |
- "recursive": Recursive character splitting with hierarchical separators
|
| 39 |
- "character": Character-based splitting on paragraph boundaries
|
| 40 |
+
- "paragraph": Paragraph-level splitting on \\n\\n boundaries
|
| 41 |
- "sentence": Sliding window over NLTK sentences
|
| 42 |
- "semantic": Embedding-based semantic chunking
|
| 43 |
+
- "page": Page-level splitting on page markers
|
| 44 |
"""
|
| 45 |
if technique == "fixed":
|
| 46 |
return CharacterTextSplitter(
|
|
|
|
| 69 |
is_separator_regex=False
|
| 70 |
)
|
| 71 |
|
| 72 |
+
elif technique == "paragraph":
|
| 73 |
+
# Paragraph-level chunking using paragraph breaks
|
| 74 |
+
return CharacterTextSplitter(
|
| 75 |
+
separator=kwargs.get('separator', "\n\n"),
|
| 76 |
+
chunk_size=chunk_size,
|
| 77 |
+
chunk_overlap=chunk_overlap,
|
| 78 |
+
length_function=len,
|
| 79 |
+
is_separator_regex=False
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
elif technique == "sentence":
|
| 83 |
# sentence-level chunking using NLTK
|
| 84 |
return NLTKTextSplitter(
|
|
|
|
| 95 |
breakpoint_threshold_amount=kwargs.get('breakpoint_threshold_amount', 70)
|
| 96 |
)
|
| 97 |
|
| 98 |
+
elif technique == "page":
|
| 99 |
+
# Page-level chunking using page markers
|
| 100 |
+
return CharacterTextSplitter(
|
| 101 |
+
separator=kwargs.get('separator', "--- Page"),
|
| 102 |
+
chunk_size=chunk_size,
|
| 103 |
+
chunk_overlap=chunk_overlap,
|
| 104 |
+
length_function=len,
|
| 105 |
+
is_separator_regex=False
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
else:
|
| 109 |
+
raise ValueError(f"Technique '{technique}' is not supported. Choose from: fixed, recursive, character, paragraph, sentence, semantic, page")
|
| 110 |
|
| 111 |
# ------------------------------------------------------------------
|
| 112 |
# Processing
|
test_backend.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Simple test script to verify backend components work correctly.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
# Load environment variables
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
def test_data_loader():
    """Exercise the data loader: load the CBT book and sanity-check its stats.

    Returns:
        The DataFrame of book pages, for reuse by downstream tests.
    """
    header = "=" * 60
    print(header)
    print("Testing Data Loader...")
    print(header)

    from data_loader import load_cbt_book, get_book_stats

    book_df = load_cbt_book("EntireBookCleaned.txt")
    book_stats = get_book_stats(book_df)

    # Echo the headline numbers so a failing run is easy to diagnose.
    for line in (
        f"β Loaded {book_stats['total_pages']} pages",
        f"β Total characters: {book_stats['total_characters']:,}",
        f"β Average chars per page: {book_stats['average_chars_per_page']:.0f}",
    ):
        print(line)

    assert book_stats['total_pages'] > 0, "No pages loaded"
    print("β Data loader test PASSED\n")
    return book_df
| 29 |
+
|
| 30 |
+
def test_config_loader():
    """Smoke-test the config loader: print key settings and verify two values.

    Returns:
        The loaded ``cfg`` object, for reuse by downstream tests.
    """
    rule = "=" * 60
    print(rule)
    print("Testing Config Loader...")
    print(rule)

    from config_loader import cfg

    # Echo every setting the rest of the pipeline depends on.
    for label, value in (
        ("Project name", cfg.project['name']),
        ("Embedding model", cfg.processing['embedding_model']),
        ("Chunking technique", cfg.processing['technique']),
        ("Chunk size", cfg.processing['chunk_size']),
        ("Vector DB index", cfg.db['base_index_name']),
        ("Retrieval mode", cfg.retrieval['mode']),
        ("Rerank strategy", cfg.retrieval['rerank_strategy']),
        ("Models to evaluate", cfg.model_list),
    ):
        print(f"β {label}: {value}")

    assert cfg.project['name'] == "cbt-rag-system", "Project name mismatch"
    assert cfg.db['base_index_name'] == "cbt-book", "Index name mismatch"
    print("β Config loader test PASSED\n")
    return cfg
| 51 |
+
|
| 52 |
+
def test_chunk_processor(df):
    """Test the chunk processor: chunk a few pages and verify embeddings.

    Args:
        df: DataFrame of book pages produced by the data loader.

    Returns:
        The list of chunk dicts (id / values / metadata) from the processor.
    """
    print("=" * 60)
    print("Testing Chunk Processor...")
    print("=" * 60)

    from retriever.processor import ChunkProcessor
    from config_loader import cfg

    # Test with first 5 pages only to speed up
    test_df = df.head(5)

    proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)
    chunks = proc.process(
        test_df,
        technique=cfg.processing['technique'],
        chunk_size=cfg.processing['chunk_size'],
        chunk_overlap=cfg.processing['chunk_overlap'],
        max_docs=5,
        verbose=False
    )

    print(f"β Created {len(chunks)} chunks from 5 pages")
    print(f"β Sample chunk ID: {chunks[0]['id']}")
    print(f"β Sample chunk size: {len(chunks[0]['metadata']['text'])} chars")

    assert len(chunks) > 0, "No chunks created"
    assert 'values' in chunks[0], "Embeddings not generated"
    # The configured embedder, jinaai/jina-embeddings-v2-small-en, produces
    # 512-dimensional vectors (see BACKEND_README.md). The previous hard-coded
    # 384 matched a MiniLM-style model and would always fail here.
    expected_dim = 512
    assert len(chunks[0]['values']) == expected_dim, \
        f"Expected {expected_dim} dimensions, got {len(chunks[0]['values'])}"
    print("β Chunk processor test PASSED\n")
    return chunks
| 83 |
+
|
| 84 |
+
def test_generator():
    """Smoke-test the RAG generator's prompt construction.

    Returns:
        The constructed RAGGenerator instance.
    """
    bar = "=" * 60
    print(bar)
    print("Testing RAG Generator...")
    print(bar)

    from retriever.generator import RAGGenerator

    rag = RAGGenerator()

    # Build a prompt from a tiny hand-written query/context pair and make
    # sure both pieces actually appear in the rendered prompt text.
    contexts = [
        "CBT is a form of psychotherapy developed by Aaron Beck.",
        "The cognitive model proposes that dysfunctional thinking influences mood and behavior."
    ]
    query = "What is CBT?"
    prompt = rag.generate_prompt(query, contexts)

    print(f"β Generated prompt length: {len(prompt)} chars")
    print(f"β Prompt contains query: {'What is CBT?' in prompt}")
    print(f"β Prompt contains context: {'Aaron Beck' in prompt}")

    assert "What is CBT?" in prompt, "Query not in prompt"
    assert "Aaron Beck" in prompt, "Context not in prompt"
    print("β RAG generator test PASSED\n")
    return rag
| 111 |
+
|
| 112 |
+
def test_evaluator():
    """Smoke-test the Groq-backed judge; skipped when no API key is set.

    Returns:
        The GroqJudge instance, or None when GROQ_API_KEY is absent.
    """
    divider = "=" * 60
    print(divider)
    print("Testing RAG Evaluator...")
    print(divider)

    from retriever.evaluator import GroqJudge

    # The judge needs a live Groq key; treat its absence as a soft skip
    # rather than a failure so offline runs still pass.
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        print("β GROQ_API_KEY not set, skipping evaluator test")
        return None

    judge = GroqJudge(api_key=api_key, model="llama-3.1-8b-instant")

    # Trivial prompt with a known answer, just to prove the call round-trips.
    response = judge.generate("What is 2 + 2? Answer with just the number.")

    print(f"β Judge response: {response}")
    print("β Evaluator test PASSED\n")
    return judge
| 134 |
+
|
| 135 |
+
def main():
    """Run every backend smoke test in dependency order; exit non-zero on failure."""
    divider = "=" * 60
    print("\n" + divider)
    print("CBT RAG SYSTEM - BACKEND TESTS")
    print(divider + "\n")

    try:
        # The chunk processor consumes the data loader's DataFrame, so the
        # loader must run first; the remaining tests are independent.
        book_df = test_data_loader()
        test_config_loader()
        test_chunk_processor(book_df)
        test_generator()
        test_evaluator()

        print(divider)
        print("ALL TESTS PASSED!")
        print(divider)
        print("\nBackend is ready for use.")
        print("\nNext steps:")
        print("1. Run 'python ingest.py' to index the book into Pinecone")
        print("2. Run 'python -m uvicorn api:app --host 0.0.0.0 --port 8000' to start the API")

    except Exception as exc:
        print(f"\nβ TEST FAILED: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
| 170 |
+
|
| 171 |
+
# Entry point: run the full backend test suite when executed as a script.
if __name__ == "__main__":
    main()
|