Spaces:

midlajvalappil
/

AI-Note-Summarizer

Sleeping

App Files Files Community

AI-Note-Summarizer / src /modules /pdf_processor.py

midlajvalappil

Upload 10 files

1ae86a7 verified 8 months ago

raw

history blame contribute delete

5.33 kB

	"""
	PDF Processing Module
	Handles PDF file upload, text extraction, and preprocessing.
	"""

	import PyPDF2
	import io
	import re
	from typing import Optional, List
	import streamlit as st

	class PDFProcessor:
	    """Validate uploaded PDF files and turn them into cleaned plain text.

	    Problems are reported to the user through Streamlit (``st.error`` /
	    ``st.warning``) and signalled to the caller by returning ``False`` or
	    ``None`` instead of raising.
	    """

	    # Upload size limit (in bytes) used when the caller does not supply one.
	    DEFAULT_MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB

	    # Page count above which the user is warned that processing may be slow.
	    LARGE_PDF_PAGE_THRESHOLD = 100

	    # Anything that is not a word character, whitespace or basic punctuation.
	    _DISALLOWED_CHARS = re.compile(r'[^\w\s\.\,\!\?\;\:\-\(\)]')

	    def __init__(self, max_file_size: int = DEFAULT_MAX_FILE_SIZE):
	        """
	        Create a processor.

	        Args:
	            max_file_size: Largest accepted upload in bytes (default 10MB)
	        """
	        self.max_file_size = max_file_size

	    def validate_pdf(self, uploaded_file) -> bool:
	        """
	        Validate uploaded PDF file

	        Args:
	            uploaded_file: Streamlit uploaded file object (must expose
	                ``size`` in bytes and ``type`` as a MIME string), or None
	                when nothing has been uploaded

	        Returns:
	            bool: True if valid, False otherwise (an error is shown via
	                ``st.error`` in that case)
	        """
	        # Nothing uploaded: report it instead of failing on attribute access.
	        if uploaded_file is None:
	            st.error("Please upload a valid PDF file")
	            return False

	        # Check file size against the configured limit, and report that same
	        # limit so the message cannot drift from the actual setting.
	        if uploaded_file.size > self.max_file_size:
	            size_mb = uploaded_file.size / 1024 / 1024
	            limit_mb = self.max_file_size / 1024 / 1024
	            st.error(f"File size ({size_mb:.1f}MB) exceeds limit ({limit_mb:g}MB)")
	            return False

	        # Check file type
	        if uploaded_file.type != "application/pdf":
	            st.error("Please upload a valid PDF file")
	            return False

	        return True

	    def extract_text_from_pdf(self, uploaded_file) -> Optional[str]:
	        """
	        Extract text content from uploaded PDF file

	        Pages whose extraction raises are skipped and reported in a single
	        warning; pages that yield only whitespace are silently ignored.

	        Args:
	            uploaded_file: Streamlit uploaded file object

	        Returns:
	            str: Text of all readable pages, each followed by a newline, or
	                None if extraction fails (encrypted, empty, unreadable or
	                text-free PDF)
	        """
	        try:
	            # Reset file pointer so the whole file is read even if the
	            # object was consumed earlier.
	            uploaded_file.seek(0)

	            # Create a PDF reader object over an in-memory copy of the upload
	            pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))

	            # Check if PDF is encrypted
	            if pdf_reader.is_encrypted:
	                st.error("❌ Cannot process encrypted PDF files. Please upload an unencrypted PDF.")
	                return None

	            # Check number of pages
	            num_pages = len(pdf_reader.pages)
	            if num_pages == 0:
	                st.error("❌ PDF file appears to be empty or corrupted.")
	                return None

	            if num_pages > self.LARGE_PDF_PAGE_THRESHOLD:
	                st.warning(f"⚠️ Large PDF detected ({num_pages} pages). Processing may take longer.")

	            # Extract text from all pages; collect the pieces and join once
	            # rather than growing a string inside the loop.
	            page_texts = []
	            failed_pages = []

	            for page_num, page in enumerate(pdf_reader.pages, start=1):
	                try:
	                    page_text = page.extract_text()
	                    if page_text.strip():  # Only add non-empty pages
	                        page_texts.append(page_text)
	                except Exception:
	                    # Best effort: one bad page must not abort the document.
	                    failed_pages.append(page_num)

	            # Report failed pages (list them only when there are a few)
	            if failed_pages:
	                if len(failed_pages) < 5:
	                    st.warning(f"⚠️ Could not extract text from pages: {', '.join(map(str, failed_pages))}")
	                else:
	                    st.warning(f"⚠️ Could not extract text from {len(failed_pages)} pages")

	            text_content = "".join(f"{page_text}\n" for page_text in page_texts)

	            if not text_content.strip():
	                st.error("❌ No readable text content found in the PDF file. The PDF might contain only images or scanned content.")
	                return None

	            # Check if extracted text is too short
	            if len(text_content.strip()) < 100:
	                st.warning("⚠️ Very little text was extracted. The PDF might contain mostly images or have formatting issues.")

	            return text_content

	        except PyPDF2.errors.PdfReadError as e:
	            st.error(f"❌ Invalid or corrupted PDF file: {str(e)}")
	            return None
	        except MemoryError:
	            st.error("❌ PDF file is too large to process. Please try a smaller file.")
	            return None
	        except Exception as e:
	            # UI boundary: surface anything unexpected instead of crashing.
	            st.error(f"❌ Unexpected error processing PDF file: {str(e)}")
	            return None

	    def preprocess_text(self, text: str) -> str:
	        """
	        Clean and preprocess extracted text

	        Characters other than word characters, whitespace and the
	        punctuation ``. , ! ? ; : - ( )`` are replaced by spaces, then every
	        run of whitespace (including newlines) is collapsed to one space.

	        Args:
	            text: Raw extracted text

	        Returns:
	            str: Cleaned single-line text, or "" for empty/None input
	        """
	        if not text:
	            return ""

	        # Replace special characters that might interfere with processing
	        text = self._DISALLOWED_CHARS.sub(' ', text)

	        # split() drops leading/trailing whitespace and treats any whitespace
	        # run (spaces, tabs, newlines) as one separator, so a single
	        # split/join both collapses and trims the text.
	        return ' '.join(text.split())

	    def process_pdf(self, uploaded_file) -> Optional[str]:
	        """
	        Complete PDF processing pipeline: validate, extract, preprocess

	        Args:
	            uploaded_file: Streamlit uploaded file object

	        Returns:
	            str: Processed text content or None if processing fails
	        """
	        if not self.validate_pdf(uploaded_file):
	            return None

	        # Extract text
	        raw_text = self.extract_text_from_pdf(uploaded_file)
	        if raw_text is None:
	            return None

	        # Preprocess text
	        processed_text = self.preprocess_text(raw_text)

	        # Still return the text, but let the user know it looks suspicious.
	        if len(processed_text) < 50:
	            st.warning("The extracted text is very short. Please check if the PDF contains readable text.")

	        return processed_text