Spaces:

rogeliorichman
/

AI_Agent_Script_Builder

Sleeping

Upload folder using huggingface_hub

92c68e3 verified about 1 year ago

1.82 kB

	import PyPDF2
	from typing import Optional

	class PDFProcessor:
	    """Handles PDF file processing and text extraction."""

	    def __init__(self):
	        """Initialize PDF processor (no state is kept)."""

	    def extract_text(self, pdf_path: str) -> str:
	        """
	        Extract text content from a PDF file.

	        The text of every page is joined with newlines and the result is
	        stripped of leading/trailing whitespace.

	        Args:
	            pdf_path: Path to the PDF file

	        Returns:
	            str: Extracted text content

	        Raises:
	            FileNotFoundError: If PDF file doesn't exist
	            PyPDF2.errors.PdfReadError: If PDF file is invalid or corrupted
	            Exception: For any other failure while processing the file
	        """
	        try:
	            with open(pdf_path, 'rb') as file:
	                reader = PyPDF2.PdfReader(file)
	                # Treat a page that yields no text (None/empty) as empty so
	                # a single blank page cannot abort the whole extraction.
	                page_texts = [
	                    page.extract_text() or "" for page in reader.pages
	                ]
	        except FileNotFoundError as e:
	            raise FileNotFoundError(f"PDF file not found: {pdf_path}") from e
	        # PdfReadError is defined in PyPDF2.errors; the top-level package
	        # does not export it, so it must be referenced via the submodule.
	        except PyPDF2.errors.PdfReadError as e:
	            raise PyPDF2.errors.PdfReadError(
	                f"Error reading PDF file: {str(e)}"
	            ) from e
	        except Exception as e:
	            raise Exception(
	                f"Unexpected error processing PDF: {str(e)}"
	            ) from e

	        return "\n".join(page_texts).strip()

	    def get_metadata(self, pdf_path: str) -> dict:
	        """
	        Extract metadata from PDF file.

	        This is a best-effort call: it never raises. Any failure is
	        reported through the returned dictionary instead.

	        Args:
	            pdf_path: Path to the PDF file

	        Returns:
	            dict: PDF metadata as reported by the reader, an empty dict when
	            the reader reports no metadata, or ``{"error": <message>}``
	            when the file could not be opened or parsed
	        """
	        try:
	            with open(pdf_path, 'rb') as file:
	                metadata = PyPDF2.PdfReader(file).metadata
	        except Exception as e:
	            # Deliberately broad: callers get an error entry, not a crash.
	            return {"error": str(e)}

	        # The reader may report no metadata at all; honour the dict contract.
	        return metadata if metadata is not None else {}