# Ingestion script: extracts paragraphs from PDFs and builds a FAISS index.
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Tuple

import fitz  # PyMuPDF
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
# Project layout: all data lives under data/ next to this file.
ROOT_DIR = Path(__file__).resolve().parent
PDF_DIR = ROOT_DIR / "data" / "pdf"
INDEX_DIR = ROOT_DIR / "data" / "index"
SOURCE_LINKS_PATH = ROOT_DIR / "data" / "source_links.json"

# Increment this when changing ingest logic so apps can trigger rebuilds
INDEX_VERSION = 3
def load_source_links(path: Path) -> Dict[str, str]:
    """Read the PDF-file-name -> source-URL mapping from a JSON file."""
    raw = path.read_text(encoding="utf-8")
    return json.loads(raw)
def clean_text(text: str) -> str:
    """Reflow PDF-extracted text: un-hyphenate line breaks and join
    single newlines into spaces, keeping blank lines as paragraph breaks.

    Returns paragraphs separated by exactly one blank line ("\n\n").
    """
    # Normalize Windows/Mac line endings, then undo end-of-line
    # hyphenation ("exam-\nple" -> "example") before any splitting.
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    normalized = normalized.replace("-\n", "")

    paragraphs: List[str] = []
    buffer: List[str] = []

    def flush() -> None:
        # Join the buffered lines of one paragraph with single spaces.
        words = [piece.strip() for piece in buffer if piece.strip()]
        if words:
            paragraphs.append(" ".join(words))
        buffer.clear()

    for raw_line in normalized.split("\n"):
        if raw_line.strip():
            buffer.append(raw_line)
        else:
            # Blank (or whitespace-only) line ends the current paragraph.
            flush()
    flush()

    return "\n\n".join(paragraphs)
# Lower-cased substrings that mark a page as boilerplate/navigation rather
# than content; pages containing any of them are dropped by is_noise_page.
NOISE_SECTION_KEYWORDS = {
    # Navigation / front matter
    "table of contents",
    "contents",
    "foreword",
    # Back matter
    "references",
    "bibliography",
    "glossary",
    "index",
    "list of figures",
    "list of tables",
    # Both spellings occur in practice
    "acknowledgements",
    "acknowledgments",
}
def looks_like_toc_or_index(text: str) -> bool:
    """Heuristic: a page with 5+ dot-leader/page-number runs (e.g.
    "Chapter 1 ..... 12") is probably a table of contents or an index."""
    if not text:
        return False
    dot_leader = re.compile(r"\.{2,}\s*\d{1,3}\b")
    hits = sum(1 for _ in dot_leader.finditer(text))
    return hits >= 5
def is_noise_page(raw_text: str, page_number: int) -> bool:
    """Decide whether a page should be excluded from indexing.

    True for: the first page of every document (dropped globally per
    product request), pages mentioning a noise-section keyword, and pages
    that look like a ToC/index by dot-leader density.
    """
    if page_number == 1:
        return True
    lowered = (raw_text or "").lower()
    # NOTE(review): substring matching is deliberately loose — a body page
    # containing the word "index" is also dropped; confirm this is intended.
    has_noise_keyword = any(keyword in lowered for keyword in NOISE_SECTION_KEYWORDS)
    return has_noise_keyword or looks_like_toc_or_index(raw_text)
def extract_paragraphs_with_pages(pdf_path: Path) -> List[Tuple[int, List[str]]]:
    """Extract cleaned paragraphs from each non-noise page of a PDF.

    Args:
        pdf_path: Path to the PDF file to parse.

    Returns:
        A list of (1-based page number, paragraphs) tuples, one entry per
        page that survived the noise filter.
    """
    results: List[Tuple[int, List[str]]] = []
    # Fix: open the document as a context manager so the underlying file
    # handle is released even if extraction raises (the original never
    # called doc.close(), leaking a handle per PDF).
    with fitz.open(pdf_path) as doc:
        for page_number in range(len(doc)):
            page = doc.load_page(page_number)
            raw_text = page.get_text("text") or ""
            # Skip pages that are likely ToC, Index, References, Glossary, or boilerplate
            if is_noise_page(raw_text, page_number + 1):
                continue
            cleaned = clean_text(raw_text)
            # Paragraph boundaries are the double newlines produced by clean_text
            paragraphs = [p.strip() for p in cleaned.split("\n\n") if p.strip()]
            results.append((page_number + 1, paragraphs))
    return results
def filename_to_title(file_name: str) -> str:
    """Derive a display title from a file name: drop the extension and
    turn underscore/hyphen separators into spaces."""
    stem = file_name.rsplit(".", 1)[0]
    return stem.translate(str.maketrans("_-", "  "))
def load_exclude_pages(path: Path) -> Dict[str, List[int]]:
    """Optional per-file manual page exclusions.

    JSON format: {"Some.pdf": [1,2,3], "Other.pdf": [10,11]}

    Keys are normalized to bare file names; page numbers are coerced to
    int. Exclusions are best-effort: a missing, unreadable, or malformed
    file yields {}, and individual bad entries are skipped.
    """
    if not path.exists():
        return {}
    try:
        with path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError):
        # Fix: was a bare `except Exception` that also hid programming
        # errors; only I/O and JSON parse failures are expected here.
        return {}
    if not isinstance(data, dict):
        # e.g. a JSON list or null at the top level — nothing to exclude.
        return {}
    norm: Dict[str, List[int]] = {}
    for key, value in data.items():
        try:
            # Normalize keys to file names only
            norm[Path(key).name] = [int(page) for page in (value or [])]
        except (TypeError, ValueError):
            # Entry whose pages are not int-coercible: skip just that entry.
            continue
    return norm
def build_index():
    """Extract paragraphs from every PDF under PDF_DIR, embed them with a
    sentence-transformers model, and persist a FAISS index plus a small
    debug/UI manifest to INDEX_DIR.

    Raises:
        FileNotFoundError: if PDF_DIR does not exist.
        RuntimeError: if no usable text was extracted from any PDF.
    """
    if not PDF_DIR.exists():
        raise FileNotFoundError(f"PDF directory not found: {PDF_DIR}")
    INDEX_DIR.mkdir(parents=True, exist_ok=True)

    links = load_source_links(SOURCE_LINKS_PATH)
    excludes = load_exclude_pages(ROOT_DIR / "data" / "exclude_pages.json")

    corpus: List[str] = []
    corpus_meta: List[Dict] = []
    for pdf_path in sorted(PDF_DIR.glob("*.pdf")):
        fname = pdf_path.name
        doc_url = links.get(fname, "")
        doc_title = filename_to_title(fname)
        skip_pages = set(excludes.get(fname, []))
        for page_num, page_paragraphs in extract_paragraphs_with_pages(pdf_path):
            if page_num in skip_pages:
                continue
            for para_idx, para in enumerate(page_paragraphs):
                # Tiny fragments (headers, artifacts) are not worth embedding
                if len(para) < 40:
                    continue
                corpus.append(para)
                corpus_meta.append(
                    {
                        "file_name": fname,
                        "title": doc_title,
                        "url": doc_url,
                        "page": page_num,
                        "paragraph_index": para_idx,
                    }
                )

    if not corpus:
        raise RuntimeError("No text extracted from PDFs. Check PDF parsing.")

    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    store = FAISS.from_texts(texts=corpus, embedding=embedder, metadatas=corpus_meta)
    store.save_local(str(INDEX_DIR))

    # Save a small manifest for debugging & UI
    manifest = {
        "num_texts": len(corpus),
        "pdf_dir": str(PDF_DIR),
        "index_dir": str(INDEX_DIR),
        "files_indexed": sorted([p.name for p in PDF_DIR.glob("*.pdf")]),
        "index_version": INDEX_VERSION,
        "manual_exclusions": excludes,
    }
    with (INDEX_DIR / "manifest.json").open("w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)
    print(f"Index built with {len(corpus)} paragraphs. Saved to {INDEX_DIR}.")
if __name__ == "__main__":
    # Allow running this module directly as a one-shot ingestion script.
    build_index()