Spaces:
Sleeping
Sleeping
| import re | |
| from langchain_community.document_loaders import PyPDFLoader | |
class BookTitleExtractor:
    """Guess a book's title from a PDF file or from already-loaded documents.

    Text heuristics are tried first; the optional LLM is consulted only when
    the heuristics find nothing.
    """

    # Placeholder returned when no title can be determined.
    _UNKNOWN_TITLE = "Unknown Title"

    # The "first significant line" heuristics ignore lines of this length or
    # shorter.
    _MIN_LINE_LENGTH = 10

    # Number of characters of page text included in the LLM prompt.
    _LLM_SAMPLE_CHARS = 1000

    # An upper-case letter followed by at least five upper-case letters,
    # whitespace or light punctuation, anchored to line boundaries. Because
    # \s also matches newlines, a title broken across consecutive lines is
    # captured as a single match.
    _ALL_CAPS_TITLE = re.compile(r'^[A-Z][A-Z\s\-:,]{5,}$', re.MULTILINE)

    def __init__(self, llm=None):
        """Store the optional LLM used as a fallback.

        Args:
            llm: Object exposing ``invoke(prompt)``, or None to disable the
                LLM fallback.
        """
        self.llm = llm

    def extract_title(self, pdf_path, max_pages: int = 5) -> str:
        """Return the title of the PDF at *pdf_path*.

        Args:
            pdf_path: Location of the PDF, passed straight to ``PyPDFLoader``.
            max_pages: Number of leading pages the heuristics may inspect.

        Returns:
            The heuristic title if one is found, otherwise the LLM's answer
            when an LLM was supplied, otherwise ``"Unknown Title"``.
        """
        title = self._extract_with_heuristics(pdf_path, max_pages)
        if title:
            return title
        if not self.llm:
            return self._UNKNOWN_TITLE
        return self._extract_with_llm(pdf_path)

    @classmethod
    def _find_all_caps_title(cls, text):
        """Return the first ALL-CAPS title in *text*, or None if there is none.

        Whitespace runs (including line breaks inside a multi-line match) are
        collapsed to single spaces so the result is a one-line title.
        """
        match = cls._ALL_CAPS_TITLE.search(text)
        if match is None:
            return None
        return " ".join(match.group().split())

    def _extract_with_heuristics(self, pdf_path, max_pages):
        """Apply the text heuristics to the first *max_pages* pages of a PDF.

        Returns:
            The detected title, or None when every inspected page is blank or
            yields no candidate.
        """
        pages = PyPDFLoader(pdf_path).load()[:max_pages]
        for page in pages:
            text = page.page_content.strip()
            if not text:
                continue

            # Heuristic 1: ALL CAPS title.
            title = self._find_all_caps_title(text)
            if title:
                return title

            # Heuristic 2: first significant (long enough) line.
            for raw_line in text.split('\n'):
                line = raw_line.strip()
                if len(line) > self._MIN_LINE_LENGTH:
                    return line
        return None

    def extract_book_title_from_documents(self, documents, max_docs: int = 5) -> str:
        """Return a title found in the first *max_docs* of *documents*.

        Args:
            documents: Sliceable sequence of objects with a ``page_content``
                string attribute.
            max_docs: Number of leading documents to inspect.

        Returns:
            The detected title, or ``"Unknown Title"`` when nothing matches.
        """
        for doc in documents[:max_docs]:
            text = doc.page_content.strip()
            if not text:
                continue

            # Heuristic 1: lines in ALL CAPS (title pages often use this).
            title = self._find_all_caps_title(text)
            if title:
                return title

            # Heuristic 2: first title-cased line that is long enough.
            for raw_line in text.split("\n"):
                line = raw_line.strip()
                if len(line) > self._MIN_LINE_LENGTH and line.istitle():
                    return line
        return self._UNKNOWN_TITLE

    def _extract_with_llm(self, pdf_path) -> str:
        """Ask the LLM to name the book from the first page that has text.

        Returns:
            The stripped LLM answer, or ``"Unknown Title"`` when the PDF has
            no text or the LLM gives no usable text answer.
        """
        pages = PyPDFLoader(pdf_path).load()

        # The first page may be blank, so sample the first page that actually
        # contains text instead of always using page 0.
        stripped_pages = (page.page_content.strip() for page in pages)
        sample_text = next((text for text in stripped_pages if text), "")
        sample_text = sample_text[:self._LLM_SAMPLE_CHARS]
        if not sample_text:
            # Nothing to show the model; skip the call rather than prompt it
            # with an empty excerpt.
            return self._UNKNOWN_TITLE

        prompt = (
            "Identify the book title from the following text:\n\n"
            f"{sample_text}\n\nOnly return the book title."
        )
        response = self.llm.invoke(prompt)

        # Some models return a plain string, others a message object whose
        # text is stored in ``.content``; accept both.
        content = getattr(response, "content", response)
        if not isinstance(content, str):
            return self._UNKNOWN_TITLE
        return content.strip() or self._UNKNOWN_TITLE