# SCDM-chatbot / ingest.py
# (Uploaded by TrizteX — "Upload 40 files", commit 31fd087, verified)
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Tuple
import fitz # PyMuPDF
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
# Project layout: all data lives under data/ next to this script.
ROOT_DIR = Path(__file__).resolve().parent
PDF_DIR = ROOT_DIR / "data" / "pdf"  # input PDFs to ingest
INDEX_DIR = ROOT_DIR / "data" / "index"  # FAISS index output directory
SOURCE_LINKS_PATH = ROOT_DIR / "data" / "source_links.json"  # file-name -> URL map
# Increment this when changing ingest logic so apps can trigger rebuilds
INDEX_VERSION = 3
def load_source_links(path: Path) -> Dict[str, str]:
    """Load the file-name -> source-URL mapping from a JSON file."""
    return json.loads(path.read_text(encoding="utf-8"))
def clean_text(text: str) -> str:
    """Normalize PDF-extracted text.

    Fixes hyphenation at line breaks and joins line-wrapped text back into
    paragraphs, while keeping blank lines as paragraph separators.  Returns
    the paragraphs joined by exactly one blank line ("\n\n").
    """
    # Normalize Windows/old-Mac line endings, then undo hyphenation
    # introduced by line wrapping: "exam-\nple" -> "example".
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("-\n", "")

    paragraphs: List[str] = []
    buffer: List[str] = []

    def _flush() -> None:
        # Join the buffered wrapped lines into a single-space paragraph.
        pieces = [chunk.strip() for chunk in buffer if chunk.strip()]
        if pieces:
            paragraphs.append(" ".join(pieces))
        buffer.clear()

    for raw_line in text.split("\n"):
        if raw_line.strip():
            buffer.append(raw_line)
        else:
            # Blank (or whitespace-only) line ends the current paragraph.
            _flush()
    _flush()

    return "\n\n".join(paragraphs)
# Pages whose lower-cased text contains any of these phrases are treated as
# boilerplate and dropped by is_noise_page().  Matching is by plain substring
# (see is_noise_page), so e.g. "index" also matches inside words like
# "indexed".
NOISE_SECTION_KEYWORDS = {
    "table of contents",
    "contents",
    "references",
    "bibliography",
    "glossary",
    "acknowledgements",
    "acknowledgments",
    "foreword",
    "index",
    "list of figures",
    "list of tables",
}
def looks_like_toc_or_index(text: str) -> bool:
    """Heuristic ToC/index detector.

    A page counts as a table of contents or index when it contains at least
    five dot leaders followed by a 1-3 digit page number (e.g. "..... 42").
    """
    if not text:
        return False
    leader_count = sum(1 for _ in re.finditer(r"\.{2,}\s*\d{1,3}\b", text))
    return leader_count >= 5
def is_noise_page(raw_text: str, page_number: int) -> bool:
    """Return True when a page should be excluded from indexing.

    A page is noise when it is the first page of the document, mentions a
    boilerplate section keyword, or looks like a ToC/index page.
    """
    # The first page of every document is always discarded (title page).
    if page_number == 1:
        return True
    lowered = (raw_text or "").lower()
    # NOTE(review): plain substring matching — a content page that merely
    # mentions e.g. "index" is also dropped; confirm this is intended.
    for keyword in NOISE_SECTION_KEYWORDS:
        if keyword in lowered:
            return True
    return looks_like_toc_or_index(raw_text)
def extract_paragraphs_with_pages(pdf_path: Path) -> List[Tuple[int, List[str]]]:
    """Extract cleaned paragraphs from every useful page of a PDF.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A list of (1-based page number, list of paragraph strings) tuples,
        one entry per page that survives the noise filter.
    """
    results: List[Tuple[int, List[str]]] = []
    # Fix: the document was previously opened but never closed, leaking the
    # file handle on every call; the context manager guarantees release
    # even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page_number in range(len(doc)):
            page = doc.load_page(page_number)
            raw_text = page.get_text("text") or ""
            # Skip pages that are likely ToC, Index, References, Glossary,
            # or boilerplate.
            if is_noise_page(raw_text, page_number + 1):
                continue
            cleaned = clean_text(raw_text)
            # Split paragraphs on the double newlines created in clean_text.
            paragraphs = [p.strip() for p in cleaned.split("\n\n") if p.strip()]
            results.append((page_number + 1, paragraphs))
    return results
def filename_to_title(file_name: str) -> str:
    """Derive a human-readable title from a file name.

    Strips the last extension (if any) and turns underscores and hyphens
    into spaces.
    """
    base = file_name
    if "." in base:
        base = base[: base.rindex(".")]
    return base.translate(str.maketrans({"_": " ", "-": " "}))
def load_exclude_pages(path: Path) -> Dict[str, List[int]]:
    """Optional per-file manual page exclusions.
    JSON format: {"Some.pdf": [1,2,3], "Other.pdf": [10,11]}

    Best-effort loader: a missing or unreadable file, or malformed
    top-level JSON, yields an empty mapping; individually malformed
    entries are skipped.
    """
    if not path.exists():
        return {}
    try:
        with path.open("r", encoding="utf-8") as handle:
            raw = json.load(handle)
        normalized: Dict[str, List[int]] = {}
        for key, pages in (raw or {}).items():
            try:
                # Normalize keys to bare file names (drop any path parts).
                normalized[Path(key).name] = [int(p) for p in (pages or [])]
            except Exception:
                continue
        return normalized
    except Exception:
        return {}
def build_index():
    """Build and persist a FAISS index over all PDFs in PDF_DIR.

    Each PDF is split into per-page paragraphs (noise pages and manually
    excluded pages skipped), every paragraph of 40+ characters becomes one
    vector-store entry with file/title/url/page metadata, and a
    manifest.json is written alongside the index for debugging and UI use.

    Raises:
        FileNotFoundError: if PDF_DIR does not exist.
        RuntimeError: if no text could be extracted from any PDF.
    """
    if not PDF_DIR.exists():
        raise FileNotFoundError(f"PDF directory not found: {PDF_DIR}")
    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    source_links = load_source_links(SOURCE_LINKS_PATH)
    exclude_map = load_exclude_pages(ROOT_DIR / "data" / "exclude_pages.json")
    texts: List[str] = []
    metadatas: List[Dict] = []
    for pdf_file in sorted(PDF_DIR.glob("*.pdf")):
        file_name = pdf_file.name
        # URL defaults to "" for files missing from source_links.json.
        url = source_links.get(file_name, "")
        title = filename_to_title(file_name)
        para_pages = extract_paragraphs_with_pages(pdf_file)
        manual_excludes = set(exclude_map.get(file_name, []))
        for page_num, paragraphs in para_pages:
            if page_num in manual_excludes:
                continue
            for paragraph_index, paragraph in enumerate(paragraphs):
                # Skip tiny fragments
                if len(paragraph) < 40:
                    continue
                texts.append(paragraph)
                metadatas.append(
                    {
                        "file_name": file_name,
                        "title": title,
                        "url": url,
                        "page": page_num,
                        "paragraph_index": paragraph_index,
                    }
                )
    if not texts:
        raise RuntimeError("No text extracted from PDFs. Check PDF parsing.")
    # NOTE(review): HuggingFaceEmbeddings downloads the model on first use,
    # so building the index requires network access the first time.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_texts(texts=texts, embedding=embeddings, metadatas=metadatas)
    vectorstore.save_local(str(INDEX_DIR))
    # Save a small manifest for debugging & UI
    manifest = {
        "num_texts": len(texts),
        "pdf_dir": str(PDF_DIR),
        "index_dir": str(INDEX_DIR),
        "files_indexed": sorted([p.name for p in PDF_DIR.glob("*.pdf")]),
        "index_version": INDEX_VERSION,
        "manual_exclusions": exclude_map,
    }
    with (INDEX_DIR / "manifest.json").open("w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)
    print(f"Index built with {len(texts)} paragraphs. Saved to {INDEX_DIR}.")
# Allow running "python ingest.py" directly to (re)build the index.
if __name__ == "__main__":
    build_index()