# SCDM-chatbot / ingest.py
# (Uploaded by TrizteX — "Upload 40 files", commit 31fd087, verified)
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Tuple
import fitz # PyMuPDF
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
# Project layout: all data lives under data/ next to this script.
ROOT_DIR = Path(__file__).resolve().parent
PDF_DIR = ROOT_DIR / "data" / "pdf"  # input PDFs to ingest
INDEX_DIR = ROOT_DIR / "data" / "index"  # FAISS index output directory
SOURCE_LINKS_PATH = ROOT_DIR / "data" / "source_links.json"  # file-name -> URL map
# Increment this when changing ingest logic so apps can trigger rebuilds
INDEX_VERSION = 3
def load_source_links(path: Path) -> Dict[str, str]:
    """Load the file-name -> source-URL mapping from a JSON file."""
    return json.loads(path.read_text(encoding="utf-8"))
def clean_text(text: str) -> str:
    """Normalize PDF-extracted text.

    Fixes hyphenation at line breaks and joins line-wrapped text back into
    paragraphs, while keeping blank lines as paragraph separators.  Returns
    the paragraphs joined by exactly one blank line ("\n\n").
    """
    # Normalize Windows/old-Mac line endings, then undo hyphenation
    # introduced by line wrapping: "exam-\nple" -> "example".
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("-\n", "")

    paragraphs: List[str] = []
    buffer: List[str] = []

    def _flush() -> None:
        # Join the buffered wrapped lines into a single-space paragraph.
        pieces = [chunk.strip() for chunk in buffer if chunk.strip()]
        if pieces:
            paragraphs.append(" ".join(pieces))
        buffer.clear()

    for raw_line in text.split("\n"):
        if raw_line.strip():
            buffer.append(raw_line)
        else:
            # Blank (or whitespace-only) line ends the current paragraph.
            _flush()
    _flush()

    return "\n\n".join(paragraphs)
# Pages whose lower-cased text contains any of these phrases are treated as
# boilerplate and dropped by is_noise_page().  Matching is by plain substring
# (see is_noise_page), so e.g. "index" also matches inside words like
# "indexed".
NOISE_SECTION_KEYWORDS = {
    "table of contents",
    "contents",
    "references",
    "bibliography",
    "glossary",
    "acknowledgements",
    "acknowledgments",
    "foreword",
    "index",
    "list of figures",
    "list of tables",
}
def looks_like_toc_or_index(text: str) -> bool:
    """Heuristic ToC/index detector.

    A page counts as a table of contents or index when it contains at least
    five dot leaders followed by a 1-3 digit page number (e.g. "..... 42").
    """
    if not text:
        return False
    leader_count = sum(1 for _ in re.finditer(r"\.{2,}\s*\d{1,3}\b", text))
    return leader_count >= 5
def is_noise_page(raw_text: str, page_number: int) -> bool:
    """Return True when a page should be excluded from indexing.

    A page is noise when it is the first page of the document, mentions a
    boilerplate section keyword, or looks like a ToC/index page.
    """
    # The first page of every document is always discarded (title page).
    if page_number == 1:
        return True
    lowered = (raw_text or "").lower()
    # NOTE(review): plain substring matching — a content page that merely
    # mentions e.g. "index" is also dropped; confirm this is intended.
    for keyword in NOISE_SECTION_KEYWORDS:
        if keyword in lowered:
            return True
    return looks_like_toc_or_index(raw_text)
def extract_paragraphs_with_pages(pdf_path: Path) -> List[Tuple[int, List[str]]]:
    """Extract cleaned paragraphs from every useful page of a PDF.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A list of (1-based page number, list of paragraph strings) tuples,
        one entry per page that survives the noise filter.
    """
    results: List[Tuple[int, List[str]]] = []
    # Fix: the document was previously opened but never closed, leaking the
    # file handle on every call; the context manager guarantees release
    # even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page_number in range(len(doc)):
            page = doc.load_page(page_number)
            raw_text = page.get_text("text") or ""
            # Skip pages that are likely ToC, Index, References, Glossary,
            # or boilerplate.
            if is_noise_page(raw_text, page_number + 1):
                continue
            cleaned = clean_text(raw_text)
            # Split paragraphs on the double newlines created in clean_text.
            paragraphs = [p.strip() for p in cleaned.split("\n\n") if p.strip()]
            results.append((page_number + 1, paragraphs))
    return results
def filename_to_title(file_name: str) -> str:
    """Derive a human-readable title from a file name.

    Strips the last extension (if any) and turns underscores and hyphens
    into spaces.
    """
    base = file_name
    if "." in base:
        base = base[: base.rindex(".")]
    return base.translate(str.maketrans({"_": " ", "-": " "}))
def load_exclude_pages(path: Path) -> Dict[str, List[int]]:
    """Optional per-file manual page exclusions.
    JSON format: {"Some.pdf": [1,2,3], "Other.pdf": [10,11]}

    Best-effort loader: a missing or unreadable file, or malformed
    top-level JSON, yields an empty mapping; individually malformed
    entries are skipped.
    """
    if not path.exists():
        return {}
    try:
        with path.open("r", encoding="utf-8") as handle:
            raw = json.load(handle)
        normalized: Dict[str, List[int]] = {}
        for key, pages in (raw or {}).items():
            try:
                # Normalize keys to bare file names (drop any path parts).
                normalized[Path(key).name] = [int(p) for p in (pages or [])]
            except Exception:
                continue
        return normalized
    except Exception:
        return {}
def build_index():
    """Build and persist a FAISS index over all PDFs in PDF_DIR.

    Each PDF is split into per-page paragraphs (noise pages and manually
    excluded pages skipped), every paragraph of 40+ characters becomes one
    vector-store entry with file/title/url/page metadata, and a
    manifest.json is written alongside the index for debugging and UI use.

    Raises:
        FileNotFoundError: if PDF_DIR does not exist.
        RuntimeError: if no text could be extracted from any PDF.
    """
    if not PDF_DIR.exists():
        raise FileNotFoundError(f"PDF directory not found: {PDF_DIR}")
    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    source_links = load_source_links(SOURCE_LINKS_PATH)
    exclude_map = load_exclude_pages(ROOT_DIR / "data" / "exclude_pages.json")
    texts: List[str] = []
    metadatas: List[Dict] = []
    for pdf_file in sorted(PDF_DIR.glob("*.pdf")):
        file_name = pdf_file.name
        # URL defaults to "" for files missing from source_links.json.
        url = source_links.get(file_name, "")
        title = filename_to_title(file_name)
        para_pages = extract_paragraphs_with_pages(pdf_file)
        manual_excludes = set(exclude_map.get(file_name, []))
        for page_num, paragraphs in para_pages:
            if page_num in manual_excludes:
                continue
            for paragraph_index, paragraph in enumerate(paragraphs):
                # Skip tiny fragments
                if len(paragraph) < 40:
                    continue
                texts.append(paragraph)
                metadatas.append(
                    {
                        "file_name": file_name,
                        "title": title,
                        "url": url,
                        "page": page_num,
                        "paragraph_index": paragraph_index,
                    }
                )
    if not texts:
        raise RuntimeError("No text extracted from PDFs. Check PDF parsing.")
    # NOTE(review): HuggingFaceEmbeddings downloads the model on first use,
    # so building the index requires network access the first time.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_texts(texts=texts, embedding=embeddings, metadatas=metadatas)
    vectorstore.save_local(str(INDEX_DIR))
    # Save a small manifest for debugging & UI
    manifest = {
        "num_texts": len(texts),
        "pdf_dir": str(PDF_DIR),
        "index_dir": str(INDEX_DIR),
        "files_indexed": sorted([p.name for p in PDF_DIR.glob("*.pdf")]),
        "index_version": INDEX_VERSION,
        "manual_exclusions": exclude_map,
    }
    with (INDEX_DIR / "manifest.json").open("w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)
    print(f"Index built with {len(texts)} paragraphs. Saved to {INDEX_DIR}.")
# Allow running "python ingest.py" directly to (re)build the index.
if __name__ == "__main__":
    build_index()