Ramail Khan committed on
Commit
7b44ae2
Β·
unverified Β·
2 Parent(s): 74dd3286c2979e

Merge pull request #4 from ramailkk/Changed_Ingestion_Logic

Browse files
Files changed (10) hide show
  1. BACKEND_README.md +61 -0
  2. EntireBookCleaned.txt +0 -0
  3. api.py +1 -1
  4. config.yaml +13 -13
  5. data_loader.py +86 -36
  6. ingest.py +221 -0
  7. main.py +115 -25
  8. requirements.txt +1 -0
  9. retriever/processor.py +23 -1
  10. test_backend.py +172 -0
BACKEND_README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CBT RAG System - Backend Documentation
2
+
3
+ ## Overview
4
+
5
+ This is a Retrieval-Augmented Generation (RAG) system for Cognitive Behavioral Therapy (CBT) content. It uses a CBT textbook as the knowledge base and implements hybrid search with re-ranking for accurate question answering.
6
+
7
+ ## Embedding Model: Jina v2 Small
8
+
9
+ **Model**: `jinaai/jina-embeddings-v2-small-en`
10
+
11
+ | Property | Value |
12
+ | ---------- | ------ |
13
+ | Dimensions | 512 |
14
+ | Max Tokens | 8,192 |
15
+ | Parameters | 33M |
16
+ | Disk Size | ~130MB |
17
+
18
+ ## Ingestion: 6 Chunking Techniques (Single Index)
19
+
20
+ The `ingest.py` script processes the CBT book **6 times** with different chunking strategies for the ablation study. All chunks are stored in a **SINGLE Pinecone index** with metadata to differentiate.
21
+
22
+ | # | Technique | Description |
23
+ | --- | --------- | ------------------------------------------------------------------------------------ |
24
+ | 1 | fixed | Fixed-size chunking - splits every N characters (may cut sentences mid-way) |
25
+ | 2 | sentence | Sentence-level chunking - respects sentence boundaries (NLTK) |
26
+ | 3 | paragraph | Paragraph-level chunking - uses natural paragraph breaks |
27
+ | 4 | semantic | Semantic chunking - splits where topic/meaning shifts (embedding similarity) |
28
 + | 5 | recursive | Recursive chunking - hierarchical splitting (paragraphs → sentences → words → chars) |
29
+ | 6 | page | Page-level chunking - uses entire book pages as-is |
30
+
31
+ **Single Pinecone Index:** `cbt-book-recursive`
32
+
33
+ **Metadata:** Each chunk includes `chunking_technique` field for filtering.
34
+
35
+ ## Configuration (`config.yaml`)
36
+
37
+ ```yaml
38
+ processing:
39
+ embedding_model: "jinaai/jina-embeddings-v2-small-en"
40
+ technique: "recursive"
41
+ chunk_size: 1000
42
+ chunk_overlap: 100
43
+
44
+ vector_db:
45
+ base_index_name: "cbt-book"
46
+ dimension: 512
47
+ metric: "cosine"
48
+ ```
49
+
50
+ ## Running the System
51
+
52
+ ```bash
53
+ source venv/bin/activate
54
+ pip install -r requirements.txt
55
+ python ingest.py
56
+ python -m uvicorn api:app --host 0.0.0.0 --port 8000
57
+ ```
58
+
59
+ ## Pinecone Setup
60
+
61
+ Create index `cbt-book-recursive` with 512 dimensions, cosine metric.
EntireBookCleaned.txt ADDED
The diff for this file is too large to render. See raw diff
 
api.py CHANGED
@@ -208,7 +208,7 @@ def startup_event() -> None:
208
  if not hf_token:
209
  raise RuntimeError("HF_TOKEN not found in environment variables")
210
 
211
- index_name = "arxiv-tournament-recursive"
212
  embed_model_name = "all-MiniLM-L6-v2"
213
  project_root = os.path.dirname(os.path.abspath(__file__))
214
  cache_dir = os.getenv("BM25_CACHE_DIR", os.path.join(project_root, ".cache"))
 
208
  if not hf_token:
209
  raise RuntimeError("HF_TOKEN not found in environment variables")
210
 
211
+ index_name = "cbt-book-recursive"
212
  embed_model_name = "all-MiniLM-L6-v2"
213
  project_root = os.path.dirname(os.path.abspath(__file__))
214
  cache_dir = os.getenv("BM25_CACHE_DIR", os.path.join(project_root, ".cache"))
config.yaml CHANGED
@@ -1,26 +1,26 @@
1
  # ------------------------------------------------------------------
2
- # RAG TOURNAMENT CONFIGURATION
3
  # ------------------------------------------------------------------
4
 
5
  project:
6
- name: "arxiv-research-rag"
7
- category: "cs.AI"
8
- doc_limit: 5
9
 
10
  processing:
11
  # Embedding model used for both vector db and evaluator similarity
12
- embedding_model: "all-MiniLM-L6-v2"
13
  # Options: sentence, recursive, semantic, fixed
14
- technique: "recursive"
15
- # Token limit for MiniLM is 256; keeping it at 250 for safety
16
- chunk_size: 500
17
- chunk_overlap: 50
18
 
19
  vector_db:
20
- base_index_name: "arxiv-tournament"
21
- dimension: 384
22
  metric: "cosine"
23
- batch_size: 100
24
 
25
  retrieval:
26
  # Options: hybrid, semantic, bm25
@@ -43,4 +43,4 @@ models:
43
  - "Mistral-7B"
44
  - "Qwen-2.5"
45
  - "DeepSeek-V3"
46
- - "TinyAya"
 
1
  # ------------------------------------------------------------------
2
+ # RAG CBT QUESTION-ANSWERING SYSTEM CONFIGURATION
3
  # ------------------------------------------------------------------
4
 
5
  project:
6
+ name: "cbt-rag-system"
7
+ category: "psychology"
8
+ doc_limit: null # Load all pages from the book
9
 
10
  processing:
11
  # Embedding model used for both vector db and evaluator similarity
12
+ embedding_model: "jinaai/jina-embeddings-v2-small-en"
13
  # Options: sentence, recursive, semantic, fixed
14
+ technique: "recursive"
15
+ # Jina supports 8192 tokens (~32k chars), using 1000 chars for better context
16
+ chunk_size: 1000
17
+ chunk_overlap: 100
18
 
19
  vector_db:
20
+ base_index_name: "cbt-book"
21
+ dimension: 512 # Jina outputs 512 dimensions
22
  metric: "cosine"
23
+ batch_size: 50 # Reduced batch size for CPU processing
24
 
25
  retrieval:
26
  # Options: hybrid, semantic, bm25
 
43
  - "Mistral-7B"
44
  - "Qwen-2.5"
45
  - "DeepSeek-V3"
46
+ - "TinyAya"
data_loader.py CHANGED
@@ -1,41 +1,91 @@
1
- import fitz # PyMuPDF
2
- import requests
3
- import io
4
- import arxiv
5
  import pandas as pd
 
6
 
7
- def extract_text_from_url(pdf_url):
8
- """Downloads a PDF and extracts all text."""
 
 
 
 
 
 
 
 
 
 
9
  try:
10
- response = requests.get(pdf_url)
11
- # Open the PDF directly from the byte stream
12
- with fitz.open(stream=io.BytesIO(response.content), filetype="pdf") as doc:
13
- text = ""
14
- for page in doc:
15
- text += page.get_text()
16
- return text.replace('\n', ' ')
17
- except Exception as e:
18
- print(f"Error downloading {pdf_url}: {e}")
19
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- def fetch_arxiv_data(category="cs.AI", limit=5):
22
- client = arxiv.Client()
23
- search = arxiv.Search(
24
- query=f"cat:{category}",
25
- max_results=limit,
26
- sort_by=arxiv.SortCriterion.SubmittedDate
27
- )
28
-
29
- results = []
30
- for r in client.results(search):
31
- print(f"Downloading full text for: {r.title[:50]}...")
32
- full_text = extract_text_from_url(r.pdf_url)
33
 
34
- results.append({
35
- "id": r.entry_id.split('/')[-1],
36
- "title": r.title,
37
- "abstract": r.summary.replace('\n', ' '),
38
- "full_text": full_text, # <--- Main part of the data
39
- "url": r.pdf_url
40
- })
41
- return pd.DataFrame(results)
 
 
 
 
 
 
1
+ import re
 
 
 
2
  import pandas as pd
3
+ from typing import List, Dict, Any
4
 
5
+
6
def load_cbt_book(file_path: str = "EntireBookCleaned.txt") -> pd.DataFrame:
    """
    Load the CBT book from a text file and parse it into per-page documents.

    Args:
        file_path: Path to the cleaned book text file.

    Returns:
        DataFrame with columns: id, title, url, full_text.

    Raises:
        FileNotFoundError: If the book file does not exist.
        ValueError: If no pages could be parsed from the file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        raise FileNotFoundError(f"Book file not found: {file_path}")

    # Page markers look like "--- Page X ---" or "--- Page X of Y ---".
    # Because the pattern has one capturing group, re.split returns:
    #   [preamble, page_num_1, body_1, page_num_2, body_2, ...]
    page_pattern = r'---\s*Page\s+(\d+)(?:\s+of\s+\d+)?\s*---'
    pieces = re.split(page_pattern, content)

    documents = []
    # pieces[0] is whatever precedes the first marker; pair up the rest.
    for raw_num, raw_body in zip(pieces[1::2], pieces[2::2]):
        page_num = raw_num.strip()
        # Collapse runs of 3+ newlines down to a single paragraph break.
        body = re.sub(r'\n{3,}', '\n\n', raw_body.strip()).strip()
        if not body:
            continue  # skip empty pages

        # Use the first line as the title when it looks like a heading;
        # otherwise fall back to a generic per-page title.
        first_line = body.split('\n')[0].strip()
        if 10 < len(first_line) < 200:
            title = first_line
        else:
            title = f"CBT Book - Page {page_num}"

        documents.append({
            "id": f"cbt-page-{page_num}",
            "title": title,
            "url": f"https://res.cloudinary.com/dajb4c1g5/image/upload/v1774864993/topic_pdfs/93/merged_pdf_1774864989649.pdf.pdf#page={page_num}",
            "full_text": body
        })

    if not documents:
        raise ValueError("No documents were parsed from the book file")

    df = pd.DataFrame(documents)
    print(f"Loaded {len(df)} pages from CBT book")
    return df
70
+
71
 
72
def get_book_stats(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Compute summary statistics for the loaded book.

    Args:
        df: DataFrame of book pages with a 'full_text' column.

    Returns:
        Dict with total_pages, total_characters, average_chars_per_page,
        min_chars and max_chars.
    """
    # Compute the per-page character lengths once and reuse the Series.
    lengths = df['full_text'].str.len()
    return {
        "total_pages": len(df),
        "total_characters": lengths.sum(),
        "average_chars_per_page": round(lengths.mean(), 2),
        "min_chars": lengths.min(),
        "max_chars": lengths.max(),
    }
ingest.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Script to ingest CBT book data into Pinecone vector database.
3
+ Ingests the book 6 times with different chunking formats for ablation study.
4
+ All chunks are stored in a SINGLE index with metadata to differentiate.
5
+ Run this once before starting the API server.
6
+ """
7
+ import os
8
+ import time
9
+ from dotenv import load_dotenv
10
+ from config_loader import cfg
11
+ from data_loader import load_cbt_book, get_book_stats
12
+ from vector_db import get_pinecone_index, refresh_pinecone_index
13
+ from retriever.processor import ChunkProcessor
14
+
15
+
16
# The six chunking strategies compared in the ablation study. Every entry
# supplies the splitter name understood by ChunkProcessor.process, the chunk
# sizing parameters, and splitter-specific keyword arguments.
CHUNKING_TECHNIQUES = [
    {
        "name": "fixed",
        "description": "Fixed-size chunking - splits every N characters (may cut sentences mid-way)",
        "chunk_size": 1000,
        "chunk_overlap": 100,
        "kwargs": {"separator": ""},  # empty separator: cut at exact character offsets
    },
    {
        "name": "sentence",
        "description": "Sentence-level chunking - respects sentence boundaries (NLTK)",
        "chunk_size": 2400,
        "chunk_overlap": 100,
        "kwargs": {},
    },
    {
        "name": "paragraph",
        "description": "Paragraph-level chunking - uses natural paragraph breaks",
        "chunk_size": 4000,
        "chunk_overlap": 100,
        "kwargs": {"separator": "\n\n"},  # split on blank-line paragraph breaks
    },
    {
        "name": "semantic",
        "description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
        "chunk_size": 2000,
        "chunk_overlap": 100,
        "kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
    },
    {
        "name": "recursive",
        # Arrow characters restored here: the source had mojibake for the "→" glyphs.
        "description": "Recursive chunking - hierarchical splitting (paragraphs → sentences → words → chars)",
        "chunk_size": 2000,
        "chunk_overlap": 100,
        "kwargs": {"separators": ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""], "keep_separator": True},
    },
    {
        "name": "page",
        "description": "Page-level chunking - uses entire book pages as-is",
        "chunk_size": 10000,  # large enough that a full page is never re-split
        "chunk_overlap": 0,   # pages do not overlap
        "kwargs": {"separator": "--- Page"},  # split on the book's page markers
    },
]
61
+
62
+
63
def ingest_single_technique(
    raw_data,
    proc,
    technique_config,
    technique_index,
    total_techniques,
):
    """Chunk the book with one technique and tag every chunk with its name.

    Args:
        raw_data: DataFrame of book pages to chunk.
        proc: ChunkProcessor used for chunking and embedding.
        technique_config: One entry from CHUNKING_TECHNIQUES.
        technique_index: 1-based position of this technique (for logging).
        total_techniques: Total number of techniques (for logging).

    Returns:
        List of chunk dicts whose ids are prefixed with the technique name
        and whose metadata carries a 'chunking_technique' field.
    """
    name = technique_config["name"]
    size = technique_config["chunk_size"]
    overlap = technique_config["chunk_overlap"]
    splitter_kwargs = technique_config.get("kwargs", {})

    print(f"\n[{technique_index}/{total_techniques}] Processing '{name}'...")
    print(f" Description: {technique_config['description']}")
    print(f" Chunk size: {size}, Overlap: {overlap}")

    # Chunk and embed the whole book with this technique's settings.
    final_chunks = proc.process(
        raw_data,
        technique=name,
        chunk_size=size,
        chunk_overlap=overlap,
        max_docs=cfg.project.get("doc_limit"),
        verbose=False,
        **splitter_kwargs,
    )

    # Prefix ids with the technique so they stay unique across techniques,
    # and record the technique in metadata for later filtering.
    for chunk in final_chunks:
        chunk["metadata"]["chunking_technique"] = name
        chunk["id"] = f"{name}-{chunk['id']}"

    print(f" Created {len(final_chunks)} chunks")

    return final_chunks
100
+
101
+
102
def ingest_data():
    """Load the CBT book, chunk it with all 6 techniques, and upload every
    chunk to one shared Pinecone index.

    Chunks from different techniques share the index and are distinguished
    by the 'chunking_technique' metadata field and a technique-prefixed id.

    Raises:
        RuntimeError: If PINECONE_API_KEY is missing from the environment.
    """
    load_dotenv()

    pinecone_key = os.getenv("PINECONE_API_KEY")
    if not pinecone_key:
        raise RuntimeError("PINECONE_API_KEY not found in environment variables")

    banner = "=" * 80
    print(banner)
    print("CBT BOOK INGESTION PIPELINE - 6 TECHNIQUES (SINGLE INDEX)")
    print(banner)
    print(f"\nTechniques to process: {len(CHUNKING_TECHNIQUES)}")
    for position, spec in enumerate(CHUNKING_TECHNIQUES, 1):
        print(f" {position}. {spec['name']}: {spec['description']}")
    print(f"\nAll chunks will be stored in a SINGLE index: {cfg.db['base_index_name']}-{cfg.processing['technique']}")
    print("Chunks are differentiated by 'chunking_technique' metadata field.")

    # Step 1: load the book once; the same pages are re-chunked per technique.
    print(f"\n{banner}")
    print("STEP 1: LOADING CBT BOOK")
    print(banner)
    print("\nLoading CBT book from EntireBookCleaned.txt...")
    raw_data = load_cbt_book("EntireBookCleaned.txt")
    stats = get_book_stats(raw_data)
    print(f" Loaded {stats['total_pages']} pages")
    print(f" Total characters: {stats['total_characters']:,}")
    print(f" Average chars per page: {stats['average_chars_per_page']:.0f}")

    # One shared processor/embedder reused for every technique.
    print(f"\nInitializing embedding model: {cfg.processing['embedding_model']}")
    proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)

    # Step 2: run each technique in turn, collecting chunks and outcomes.
    print(f"\n{banner}")
    print("STEP 2: CHUNKING WITH 6 TECHNIQUES")
    print(banner)

    all_chunks = []
    results = {}

    for position, spec in enumerate(CHUNKING_TECHNIQUES, 1):
        try:
            chunks = ingest_single_technique(
                raw_data=raw_data,
                proc=proc,
                technique_config=spec,
                technique_index=position,
                total_techniques=len(CHUNKING_TECHNIQUES),
            )
            all_chunks.extend(chunks)
            results[spec["name"]] = {"status": "success", "chunks": len(chunks)}

            # Pause between techniques so the embedding API is not hammered.
            if position < len(CHUNKING_TECHNIQUES):
                print(" Waiting 5 seconds before next technique (rate limit protection)...")
                time.sleep(5)

        except Exception as e:
            # A failing technique is recorded but does not abort the others.
            print(f" ERROR with technique '{spec['name']}': {e}")
            results[spec["name"]] = {"status": "failed", "error": str(e)}

    # Step 3: push everything into the single shared index.
    print(f"\n{banner}")
    print("STEP 3: UPLOADING TO SINGLE PINECONE INDEX")
    print(banner)

    index_name = f"{cfg.db['base_index_name']}-{cfg.processing['technique']}"
    print(f"\nIndex name: {index_name}")
    print(f"Dimension: {cfg.db['dimension']}")
    print(f"Metric: {cfg.db['metric']}")
    print(f"Total chunks to upload: {len(all_chunks)}")

    index = get_pinecone_index(
        pinecone_key,
        cfg.db['base_index_name'],
        technique=cfg.processing['technique'],
        dimension=cfg.db['dimension'],
        metric=cfg.db['metric'],
    )

    print(f"\nUploading {len(all_chunks)} vectors to Pinecone...")
    refresh_pinecone_index(index, all_chunks, batch_size=cfg.db['batch_size'])

    _print_summary(results, all_chunks, index_name, banner)


def _print_summary(results, all_chunks, index_name, banner):
    """Print the per-technique ingestion summary table and next steps."""
    print(f"\n{banner}")
    print("INGESTION COMPLETE - SUMMARY")
    print(banner)
    print(f"\n{'Technique':<15} {'Status':<12} {'Chunks':<10}")
    print("-" * 40)
    total_chunks = 0
    for spec in CHUNKING_TECHNIQUES:
        outcome = results.get(spec["name"], {})
        status = outcome.get("status", "unknown")
        count = outcome.get("chunks", 0)
        if status == "success":
            total_chunks += count
        print(f"{spec['name']:<15} {status:<12} {count:<10}")
    print("-" * 40)
    print(f"{'TOTAL':<15} {'':<12} {total_chunks:<10}")

    print(f"\nSingle index: {index_name}")
    print(f"Total vectors: {len(all_chunks)}")
    print("\nChunks can be filtered by 'chunking_technique' metadata field:")
    for spec in CHUNKING_TECHNIQUES:
        if results.get(spec["name"], {}).get("status") == "success":
            print(f" - chunking_technique: '{spec['name']}'")

    print("\nYou can now start the API server with:")
    print(" python -m uvicorn api:app --host 0.0.0.0 --port 8000")
218
+
219
+
220
+ if __name__ == "__main__":
221
+ ingest_data()
main.py CHANGED
@@ -1,15 +1,15 @@
1
  import os
2
  from dotenv import load_dotenv
3
- from config_loader import cfg # Import the Mother Config
4
 
5
  from vector_db import get_pinecone_index, refresh_pinecone_index
6
  from retriever.retriever import HybridRetriever
7
  from retriever.generator import RAGGenerator
8
  from retriever.processor import ChunkProcessor
9
  from retriever.evaluator import RAGEvaluator
10
- import data_loader as dl
11
 
12
- # Import fleet mapping
13
  from models.llama_3_8b import Llama3_8B
14
  from models.mistral_7b import Mistral_7b
15
  from models.qwen_2_5 import Qwen2_5
@@ -26,16 +26,36 @@ MODEL_MAP = {
26
 
27
  load_dotenv()
28
 
 
29
  def main():
 
30
  hf_token = os.getenv("HF_TOKEN")
31
  pinecone_key = os.getenv("PINECONE_API_KEY")
32
- query = "How do transformers handle long sequences?"
33
 
34
- # 1. Data Ingestion (Controlled by Config)
35
- raw_data = dl.fetch_arxiv_data(
36
- category=cfg.project['category'],
37
- limit=cfg.project['doc_limit']
38
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  # 2. Chunking & Embedding
41
  proc = ChunkProcessor(model_name=cfg.processing['embedding_model'])
@@ -43,20 +63,38 @@ def main():
43
  raw_data,
44
  technique=cfg.processing['technique'],
45
  chunk_size=cfg.processing['chunk_size'],
46
- chunk_overlap=cfg.processing['chunk_overlap']
 
 
47
  )
48
 
49
- # 3. Vector DB (Auto-names index based on technique)
 
 
 
 
 
 
50
  index = get_pinecone_index(
51
- pinecone_key,
52
- cfg.db['base_index_name'],
53
  technique=cfg.processing['technique'],
54
  dimension=cfg.db['dimension']
55
  )
56
  refresh_pinecone_index(index, final_chunks, batch_size=cfg.db['batch_size'])
57
 
58
- # 4. Retrieval
 
 
 
 
59
  retriever = HybridRetriever(final_chunks, proc.encoder)
 
 
 
 
 
 
60
  context_chunks = retriever.search(
61
  query, index,
62
  mode=cfg.retrieval['mode'],
@@ -66,44 +104,96 @@ def main():
66
  final_k=cfg.retrieval['final_k']
67
  )
68
 
69
- # 5. Initialization of Contestants
 
 
 
 
 
 
70
  rag_engine = RAGGenerator()
71
  models = {name: MODEL_MAP[name](token=hf_token) for name in cfg.model_list}
72
-
73
- # Setup Evaluator with the designated Judge
74
 
 
75
  evaluator = RAGEvaluator(
76
- judge_model=cfg.gen['judge_model'],
77
- embedding_model=proc.encoder,
78
- api_key=os.getenv("GROQ_API_KEY")
79
  )
80
 
81
  tournament_results = {}
82
 
83
- # 6. Tournament Loop
84
  for name, model_inst in models.items():
85
- print(f"\n--- Processing {name} ---")
 
 
86
  try:
87
  # Generation
88
  answer = rag_engine.get_answer(
89
- model_inst, query, context_chunks,
90
  temperature=cfg.gen['temperature']
91
  )
92
-
 
 
 
93
  # Faithfulness Evaluation
94
  faith = evaluator.evaluate_faithfulness(answer, context_chunks)
95
  # Relevancy Evaluation
96
  rel = evaluator.evaluate_relevancy(query, answer)
97
 
98
  tournament_results[name] = {
 
99
  "Faithfulness": faith['score'],
100
  "Relevancy": rel['score'],
101
  "Claims": faith['details']
102
  }
 
 
 
 
 
103
  except Exception as e:
104
  print(f"Error evaluating {name}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
- # 7. Final Output (Omitted for brevity, use your existing report logic)
107
 
108
  if __name__ == "__main__":
109
  main()
 
1
  import os
2
  from dotenv import load_dotenv
3
+ from config_loader import cfg
4
 
5
  from vector_db import get_pinecone_index, refresh_pinecone_index
6
  from retriever.retriever import HybridRetriever
7
  from retriever.generator import RAGGenerator
8
  from retriever.processor import ChunkProcessor
9
  from retriever.evaluator import RAGEvaluator
10
+ from data_loader import load_cbt_book, get_book_stats
11
 
12
+ # Import model fleet
13
  from models.llama_3_8b import Llama3_8B
14
  from models.mistral_7b import Mistral_7b
15
  from models.qwen_2_5 import Qwen2_5
 
26
 
27
  load_dotenv()
28
 
29
+
30
  def main():
31
+ """Main function to run the RAG tournament on CBT book."""
32
  hf_token = os.getenv("HF_TOKEN")
33
  pinecone_key = os.getenv("PINECONE_API_KEY")
34
+ groq_key = os.getenv("GROQ_API_KEY")
35
 
36
+ # Verify environment variables
37
+ if not hf_token:
38
+ raise RuntimeError("HF_TOKEN not found in environment variables")
39
+ if not pinecone_key:
40
+ raise RuntimeError("PINECONE_API_KEY not found in environment variables")
41
+ if not groq_key:
42
+ raise RuntimeError("GROQ_API_KEY not found in environment variables")
43
+
44
+ # Example query for testing
45
+ query = "What is cognitive behavior therapy and how does it work?"
46
+
47
+ print("=" * 80)
48
+ print("CBT RAG SYSTEM - LOADING DATA")
49
+ print("=" * 80)
50
+
51
+ # 1. Data Ingestion - Load CBT Book
52
+ raw_data = load_cbt_book("EntireBookCleaned.txt")
53
+ stats = get_book_stats(raw_data)
54
+ print(f"Book Statistics: {stats}")
55
+
56
+ print("\n" + "=" * 80)
57
+ print("CHUNKING AND EMBEDDING")
58
+ print("=" * 80)
59
 
60
  # 2. Chunking & Embedding
61
  proc = ChunkProcessor(model_name=cfg.processing['embedding_model'])
 
63
  raw_data,
64
  technique=cfg.processing['technique'],
65
  chunk_size=cfg.processing['chunk_size'],
66
+ chunk_overlap=cfg.processing['chunk_overlap'],
67
+ max_docs=cfg.project.get('doc_limit'), # None means load all
68
+ verbose=True
69
  )
70
 
71
+ print(f"\nTotal chunks created: {len(final_chunks)}")
72
+
73
+ print("\n" + "=" * 80)
74
+ print("VECTOR DATABASE SETUP")
75
+ print("=" * 80)
76
+
77
+ # 3. Vector DB - Create/Update Pinecone Index
78
  index = get_pinecone_index(
79
+ pinecone_key,
80
+ cfg.db['base_index_name'],
81
  technique=cfg.processing['technique'],
82
  dimension=cfg.db['dimension']
83
  )
84
  refresh_pinecone_index(index, final_chunks, batch_size=cfg.db['batch_size'])
85
 
86
+ print("\n" + "=" * 80)
87
+ print("RETRIEVAL SETUP")
88
+ print("=" * 80)
89
+
90
+ # 4. Retrieval Setup
91
  retriever = HybridRetriever(final_chunks, proc.encoder)
92
+
93
+ print("\n" + "=" * 80)
94
+ print(f"TESTING QUERY: {query}")
95
+ print("=" * 80)
96
+
97
+ # Test retrieval
98
  context_chunks = retriever.search(
99
  query, index,
100
  mode=cfg.retrieval['mode'],
 
104
  final_k=cfg.retrieval['final_k']
105
  )
106
 
107
+ print(f"\nRetrieved {len(context_chunks)} context chunks")
108
+
109
+ print("\n" + "=" * 80)
110
+ print("MODEL TOURNAMENT")
111
+ print("=" * 80)
112
+
113
+ # 5. Initialize Models
114
  rag_engine = RAGGenerator()
115
  models = {name: MODEL_MAP[name](token=hf_token) for name in cfg.model_list}
 
 
116
 
117
+ # 6. Setup Evaluator with Judge
118
  evaluator = RAGEvaluator(
119
+ judge_model=cfg.gen['judge_model'],
120
+ embedding_model=proc.encoder,
121
+ api_key=groq_key
122
  )
123
 
124
  tournament_results = {}
125
 
126
+ # 7. Tournament Loop
127
  for name, model_inst in models.items():
128
+ print(f"\n{'='*60}")
129
+ print(f"Processing {name}")
130
+ print('='*60)
131
  try:
132
  # Generation
133
  answer = rag_engine.get_answer(
134
+ model_inst, query, context_chunks,
135
  temperature=cfg.gen['temperature']
136
  )
137
+
138
+ print(f"\nAnswer from {name}:")
139
+ print(answer[:500] + "..." if len(answer) > 500 else answer)
140
+
141
  # Faithfulness Evaluation
142
  faith = evaluator.evaluate_faithfulness(answer, context_chunks)
143
  # Relevancy Evaluation
144
  rel = evaluator.evaluate_relevancy(query, answer)
145
 
146
  tournament_results[name] = {
147
+ "answer": answer,
148
  "Faithfulness": faith['score'],
149
  "Relevancy": rel['score'],
150
  "Claims": faith['details']
151
  }
152
+
153
+ print(f"\n{name} Results:")
154
+ print(f" Faithfulness: {faith['score']:.1f}%")
155
+ print(f" Relevancy: {rel['score']:.3f}")
156
+
157
  except Exception as e:
158
  print(f"Error evaluating {name}: {e}")
159
+ tournament_results[name] = {
160
+ "answer": "",
161
+ "Faithfulness": 0,
162
+ "Relevancy": 0,
163
+ "Claims": [],
164
+ "error": str(e)
165
+ }
166
+
167
+ # 8. Final Results Summary
168
+ print("\n" + "=" * 80)
169
+ print("TOURNAMENT RESULTS SUMMARY")
170
+ print("=" * 80)
171
+
172
+ print(f"\nQuery: {query}")
173
+ print(f"\nRetrieved Context Chunks: {len(context_chunks)}")
174
+ print("\n" + "-" * 60)
175
+ print(f"{'Model':<20} {'Faithfulness':>15} {'Relevancy':>15}")
176
+ print("-" * 60)
177
+
178
+ for name, results in tournament_results.items():
179
+ faith = results.get('Faithfulness', 0)
180
+ rel = results.get('Relevancy', 0)
181
+ print(f"{name:<20} {faith:>14.1f}% {rel:>15.3f}")
182
+
183
+ print("-" * 60)
184
+
185
+ # Find best model
186
+ if tournament_results:
187
+ best_model = max(
188
+ tournament_results.items(),
189
+ key=lambda x: x[1].get('Faithfulness', 0) + x[1].get('Relevancy', 0)
190
+ )
191
+ print(f"\nBest Overall Model: {best_model[0]}")
192
+ print(f" Faithfulness: {best_model[1]['Faithfulness']:.1f}%")
193
+ print(f" Relevancy: {best_model[1]['Relevancy']:.3f}")
194
+
195
+ return tournament_results
196
 
 
197
 
198
  if __name__ == "__main__":
199
  main()
requirements.txt CHANGED
@@ -93,3 +93,4 @@ uuid_utils==0.14.1
93
  xxhash==3.6.0
94
  yarl==1.23.0
95
  zstandard==0.25.0
 
 
93
  xxhash==3.6.0
94
  yarl==1.23.0
95
  zstandard==0.25.0
96
+ groq==0.13.0
retriever/processor.py CHANGED
@@ -37,8 +37,10 @@ class ChunkProcessor:
37
  - "fixed": Character-based, may split mid-sentence
38
  - "recursive": Recursive character splitting with hierarchical separators
39
  - "character": Character-based splitting on paragraph boundaries
 
40
  - "sentence": Sliding window over NLTK sentences
41
  - "semantic": Embedding-based semantic chunking
 
42
  """
43
  if technique == "fixed":
44
  return CharacterTextSplitter(
@@ -67,6 +69,16 @@ class ChunkProcessor:
67
  is_separator_regex=False
68
  )
69
 
 
 
 
 
 
 
 
 
 
 
70
  elif technique == "sentence":
71
  # sentence-level chunking using NLTK
72
  return NLTKTextSplitter(
@@ -83,8 +95,18 @@ class ChunkProcessor:
83
  breakpoint_threshold_amount=kwargs.get('breakpoint_threshold_amount', 70)
84
  )
85
 
 
 
 
 
 
 
 
 
 
 
86
  else:
87
- raise ValueError(f"Technique '{technique}' is not supported. Choose from: fixed, recursive, character, sentence, semantic")
88
 
89
  # ------------------------------------------------------------------
90
  # Processing
 
37
  - "fixed": Character-based, may split mid-sentence
38
  - "recursive": Recursive character splitting with hierarchical separators
39
  - "character": Character-based splitting on paragraph boundaries
40
+ - "paragraph": Paragraph-level splitting on \\n\\n boundaries
41
  - "sentence": Sliding window over NLTK sentences
42
  - "semantic": Embedding-based semantic chunking
43
+ - "page": Page-level splitting on page markers
44
  """
45
  if technique == "fixed":
46
  return CharacterTextSplitter(
 
69
  is_separator_regex=False
70
  )
71
 
72
+ elif technique == "paragraph":
73
+ # Paragraph-level chunking using paragraph breaks
74
+ return CharacterTextSplitter(
75
+ separator=kwargs.get('separator', "\n\n"),
76
+ chunk_size=chunk_size,
77
+ chunk_overlap=chunk_overlap,
78
+ length_function=len,
79
+ is_separator_regex=False
80
+ )
81
+
82
  elif technique == "sentence":
83
  # sentence-level chunking using NLTK
84
  return NLTKTextSplitter(
 
95
  breakpoint_threshold_amount=kwargs.get('breakpoint_threshold_amount', 70)
96
  )
97
 
98
+ elif technique == "page":
99
+ # Page-level chunking using page markers
100
+ return CharacterTextSplitter(
101
+ separator=kwargs.get('separator', "--- Page"),
102
+ chunk_size=chunk_size,
103
+ chunk_overlap=chunk_overlap,
104
+ length_function=len,
105
+ is_separator_regex=False
106
+ )
107
+
108
  else:
109
+ raise ValueError(f"Technique '{technique}' is not supported. Choose from: fixed, recursive, character, paragraph, sentence, semantic, page")
110
 
111
  # ------------------------------------------------------------------
112
  # Processing
test_backend.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simple test script to verify backend components work correctly.
3
+ """
4
+ import os
5
+ import sys
6
+ from dotenv import load_dotenv
7
+
8
+ # Load environment variables
9
+ load_dotenv()
10
+
11
+ def test_data_loader():
12
+ """Test the data loader module."""
13
+ print("=" * 60)
14
+ print("Testing Data Loader...")
15
+ print("=" * 60)
16
+
17
+ from data_loader import load_cbt_book, get_book_stats
18
+
19
+ df = load_cbt_book("EntireBookCleaned.txt")
20
+ stats = get_book_stats(df)
21
+
22
+ print(f"βœ“ Loaded {stats['total_pages']} pages")
23
+ print(f"βœ“ Total characters: {stats['total_characters']:,}")
24
+ print(f"βœ“ Average chars per page: {stats['average_chars_per_page']:.0f}")
25
+
26
+ assert stats['total_pages'] > 0, "No pages loaded"
27
+ print("βœ“ Data loader test PASSED\n")
28
+ return df
29
+
30
+ def test_config_loader():
31
+ """Test the config loader module."""
32
+ print("=" * 60)
33
+ print("Testing Config Loader...")
34
+ print("=" * 60)
35
+
36
+ from config_loader import cfg
37
+
38
+ print(f"βœ“ Project name: {cfg.project['name']}")
39
+ print(f"βœ“ Embedding model: {cfg.processing['embedding_model']}")
40
+ print(f"βœ“ Chunking technique: {cfg.processing['technique']}")
41
+ print(f"βœ“ Chunk size: {cfg.processing['chunk_size']}")
42
+ print(f"βœ“ Vector DB index: {cfg.db['base_index_name']}")
43
+ print(f"βœ“ Retrieval mode: {cfg.retrieval['mode']}")
44
+ print(f"βœ“ Rerank strategy: {cfg.retrieval['rerank_strategy']}")
45
+ print(f"βœ“ Models to evaluate: {cfg.model_list}")
46
+
47
+ assert cfg.project['name'] == "cbt-rag-system", "Project name mismatch"
48
+ assert cfg.db['base_index_name'] == "cbt-book", "Index name mismatch"
49
+ print("βœ“ Config loader test PASSED\n")
50
+ return cfg
51
+
52
+ def test_chunk_processor(df):
53
+ """Test the chunk processor module."""
54
+ print("=" * 60)
55
+ print("Testing Chunk Processor...")
56
+ print("=" * 60)
57
+
58
+ from retriever.processor import ChunkProcessor
59
+ from config_loader import cfg
60
+
61
+ # Test with first 5 pages only to speed up
62
+ test_df = df.head(5)
63
+
64
+ proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)
65
+ chunks = proc.process(
66
+ test_df,
67
+ technique=cfg.processing['technique'],
68
+ chunk_size=cfg.processing['chunk_size'],
69
+ chunk_overlap=cfg.processing['chunk_overlap'],
70
+ max_docs=5,
71
+ verbose=False
72
+ )
73
+
74
+ print(f"βœ“ Created {len(chunks)} chunks from 5 pages")
75
+ print(f"βœ“ Sample chunk ID: {chunks[0]['id']}")
76
+ print(f"βœ“ Sample chunk size: {len(chunks[0]['metadata']['text'])} chars")
77
+
78
+ assert len(chunks) > 0, "No chunks created"
79
+ assert 'values' in chunks[0], "Embeddings not generated"
80
+ assert len(chunks[0]['values']) == 384, f"Expected 384 dimensions, got {len(chunks[0]['values'])}"
81
+ print("βœ“ Chunk processor test PASSED\n")
82
+ return chunks
83
+
84
+ def test_generator():
85
+ """Test the RAG generator module."""
86
+ print("=" * 60)
87
+ print("Testing RAG Generator...")
88
+ print("=" * 60)
89
+
90
+ from retriever.generator import RAGGenerator
91
+
92
+ generator = RAGGenerator()
93
+
94
+ # Test prompt generation
95
+ test_contexts = [
96
+ "CBT is a form of psychotherapy developed by Aaron Beck.",
97
+ "The cognitive model proposes that dysfunctional thinking influences mood and behavior."
98
+ ]
99
+ test_query = "What is CBT?"
100
+
101
+ prompt = generator.generate_prompt(test_query, test_contexts)
102
+
103
+ print(f"βœ“ Generated prompt length: {len(prompt)} chars")
104
+ print(f"βœ“ Prompt contains query: {'What is CBT?' in prompt}")
105
+ print(f"βœ“ Prompt contains context: {'Aaron Beck' in prompt}")
106
+
107
+ assert "What is CBT?" in prompt, "Query not in prompt"
108
+ assert "Aaron Beck" in prompt, "Context not in prompt"
109
+ print("βœ“ RAG generator test PASSED\n")
110
+ return generator
111
+
112
+ def test_evaluator():
113
+ """Test the RAG evaluator module."""
114
+ print("=" * 60)
115
+ print("Testing RAG Evaluator...")
116
+ print("=" * 60)
117
+
118
+ from retriever.evaluator import GroqJudge
119
+
120
+ groq_key = os.getenv("GROQ_API_KEY")
121
+ if not groq_key:
122
+ print("⚠ GROQ_API_KEY not set, skipping evaluator test")
123
+ return None
124
+
125
+ judge = GroqJudge(api_key=groq_key, model="llama-3.1-8b-instant")
126
+
127
+ # Simple test prompt
128
+ test_prompt = "What is 2 + 2? Answer with just the number."
129
+ response = judge.generate(test_prompt)
130
+
131
+ print(f"βœ“ Judge response: {response}")
132
+ print("βœ“ Evaluator test PASSED\n")
133
+ return judge
134
+
135
+ def main():
136
+ """Run all tests."""
137
+ print("\n" + "=" * 60)
138
+ print("CBT RAG SYSTEM - BACKEND TESTS")
139
+ print("=" * 60 + "\n")
140
+
141
+ try:
142
+ # Test data loader
143
+ df = test_data_loader()
144
+
145
+ # Test config loader
146
+ cfg = test_config_loader()
147
+
148
+ # Test chunk processor
149
+ chunks = test_chunk_processor(df)
150
+
151
+ # Test generator
152
+ generator = test_generator()
153
+
154
+ # Test evaluator
155
+ judge = test_evaluator()
156
+
157
+ print("=" * 60)
158
+ print("ALL TESTS PASSED!")
159
+ print("=" * 60)
160
+ print("\nBackend is ready for use.")
161
+ print("\nNext steps:")
162
+ print("1. Run 'python ingest.py' to index the book into Pinecone")
163
+ print("2. Run 'python -m uvicorn api:app --host 0.0.0.0 --port 8000' to start the API")
164
+
165
+ except Exception as e:
166
+ print(f"\nβœ— TEST FAILED: {e}")
167
+ import traceback
168
+ traceback.print_exc()
169
+ sys.exit(1)
170
+
171
+ if __name__ == "__main__":
172
+ main()