Spaces:

hoshoo21
/

Custom_RAG

Sleeping

Custom_RAG / book_title_extractor.py

hoshoo21

deployment

7a837d4 4 months ago

2.21 kB

	import re
	from langchain_community.document_loaders import PyPDFLoader

	class BookTitleExtractor:
	    """Guess a book's title from the text of its first pages.

	    Cheap text heuristics are tried first.  When they find nothing and an
	    LLM was supplied, the LLM is asked to name the title from a text sample.
	    The public methods return ``"Unknown Title"`` when no title is found.
	    """

	    _UNKNOWN_TITLE = "Unknown Title"

	    # Capitals, whitespace and light punctuation filling whole line(s).
	    # ``\s`` also matches newlines, so a single match may span several
	    # consecutive lines (multi-line titles) and may be mostly blank; both
	    # cases are cleaned up in ``_find_all_caps_title``.
	    _ALL_CAPS_RUN = re.compile(r'^[A-Z][A-Z\s\-:,]{5,}$', re.MULTILINE)

	    # Minimum length of an ALL-CAPS title once whitespace is collapsed; this
	    # mirrors the "1 + 5 characters" the pattern itself demands.
	    _MIN_CAPS_TITLE_LEN = 6

	    # A plain line must be strictly longer than this to count as a title.
	    _MIN_LINE_LEN = 10

	    # Number of characters of page text sent to the LLM.
	    _LLM_SAMPLE_CHARS = 1000

	    def __init__(self, llm=None):
	        """Store the optional LLM used as a fallback.

	        Args:
	            llm: Object exposing ``invoke(prompt)``.  The result may be a
	                plain string or a message object with a ``content``
	                attribute.  A falsy value disables the LLM fallback.
	        """
	        self.llm = llm

	    def extract_title(self, pdf_path, max_pages=5):
	        """Return the title of the PDF at ``pdf_path``.

	        Args:
	            pdf_path: Path handed to ``PyPDFLoader``.
	            max_pages: Number of leading pages the heuristics may inspect.

	        Returns:
	            The heuristic title if one is found, otherwise the LLM's answer
	            when an LLM is configured, otherwise ``"Unknown Title"``.
	        """
	        title = self._extract_with_heuristics(pdf_path, max_pages)
	        if title:
	            return title
	        if self.llm:
	            return self._extract_with_llm(pdf_path)
	        return self._UNKNOWN_TITLE

	    @classmethod
	    def _find_all_caps_title(cls, text):
	        """Return the first usable ALL-CAPS run in ``text``, or ``None``.

	        Runs of whitespace (including newlines inside a multi-line match)
	        are collapsed to single spaces so the title is one clean line.
	        Matches that are mostly blank, e.g. a lone capital followed by
	        empty lines, are skipped.
	        """
	        for match in cls._ALL_CAPS_RUN.finditer(text):
	            candidate = " ".join(match.group().split())
	            if len(candidate) >= cls._MIN_CAPS_TITLE_LEN:
	                return candidate
	        return None

	    @staticmethod
	    def _response_text(response):
	        """Coerce the result of ``llm.invoke`` to a plain string.

	        Handles a bare string, an object whose ``content`` attribute is a
	        string, and an object whose ``content`` is a list of strings and/or
	        dicts carrying a ``"text"`` entry.  Anything else is passed through
	        ``str()``.
	        """
	        content = getattr(response, "content", response)
	        if isinstance(content, str):
	            return content
	        if isinstance(content, (list, tuple)):
	            parts = []
	            for part in content:
	                if isinstance(part, str):
	                    parts.append(part)
	                elif isinstance(part, dict) and isinstance(part.get("text"), str):
	                    parts.append(part["text"])
	            return "".join(parts)
	        return str(content)

	    def _extract_with_heuristics(self, pdf_path, max_pages):
	        """Apply the text heuristics to the first ``max_pages`` pages.

	        Returns:
	            The first ALL-CAPS title found on the first non-empty page,
	            else that page's first line longer than 10 characters; pages
	            yielding neither are skipped.  ``None`` if no page yields one.
	        """
	        pages = PyPDFLoader(pdf_path).load()[:max_pages]

	        for page in pages:
	            text = page.page_content.strip()
	            if not text:
	                continue

	            # Heuristic 1: ALL CAPS title.
	            caps_title = self._find_all_caps_title(text)
	            if caps_title:
	                return caps_title

	            # Heuristic 2: first significant (long enough) line.
	            for line in text.split('\n'):
	                line = line.strip()
	                if len(line) > self._MIN_LINE_LEN:
	                    return line
	        return None

	    def extract_book_title_from_documents(self, documents, max_docs=5):
	        """Return a title guessed from already-loaded documents.

	        Args:
	            documents: Sliceable sequence of objects with a ``page_content``
	                string attribute.
	            max_docs: Number of leading documents to inspect.

	        Returns:
	            The first ALL-CAPS title, else the first title-cased line longer
	            than 10 characters, else ``"Unknown Title"``.
	        """
	        for doc in documents[:max_docs]:
	            text = doc.page_content.strip()
	            if not text:
	                continue

	            # Heuristic 1: ALL CAPS run (title pages often use this).
	            caps_title = self._find_all_caps_title(text)
	            if caps_title:
	                return caps_title

	            # Heuristic 2: first sufficiently long, title-cased line.
	            for line in text.split("\n"):
	                line = line.strip()
	                if len(line) > self._MIN_LINE_LEN and line.istitle():
	                    return line
	        return self._UNKNOWN_TITLE

	    def _extract_with_llm(self, pdf_path):
	        """Ask the configured LLM for the title of the PDF at ``pdf_path``.

	        The prompt contains up to 1000 characters from the first page that
	        has any extractable text.

	        Returns:
	            The LLM's stripped answer, or ``"Unknown Title"`` when the PDF
	            has no text or the answer is empty.
	        """
	        pages = PyPDFLoader(pdf_path).load()

	        # Use the first page that actually has text: a leading page with no
	        # extractable text would otherwise give the LLM an empty excerpt.
	        sample_text = ""
	        for page in pages:
	            stripped = page.page_content.strip()
	            if stripped:
	                sample_text = stripped[:self._LLM_SAMPLE_CHARS]
	                break
	        if not sample_text:
	            return self._UNKNOWN_TITLE

	        prompt = (
	            "Identify the book title from the following text:\n\n"
	            f"{sample_text}\n\nOnly return the book title."
	        )
	        # invoke() may return a message object rather than a str, so the
	        # result is coerced before stripping.
	        answer = self._response_text(self.llm.invoke(prompt)).strip()
	        return answer or self._UNKNOWN_TITLE