Spaces:
Sleeping
Sleeping
File size: 1,904 Bytes
5ff6b14 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
"""
Text Extractor Module
Handles extracting text content from PDF files.
"""
import pdfplumber
class TextExtractor:
    """Extract plain text from PDF files and sanity-check the result."""

    def __init__(self) -> None:
        """Initialize the text extractor (no state is kept)."""

    async def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract the text of every page of a PDF into one string.

        Each page that yields text is preceded by a ``--- Page N ---``
        marker line (``N`` is 1-based); pages for which ``extract_text()``
        returns nothing are skipped entirely.

        NOTE: this coroutine contains no ``await``; the pdfplumber calls run
        synchronously inside it.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            str: Extracted text content ("" if no page produced text)

        Raises:
            Exception: If text extraction fails; the original error is
                attached as ``__cause__``.
        """
        print("📖 Extracting text from PDF...")

        # Collect per-page chunks and join once instead of growing a string
        # with += on every page.
        page_chunks = []
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, start=1):
                    text = page.extract_text()
                    if text:
                        page_chunks.append(f"\n--- Page {page_num} ---\n{text}")
        except Exception as e:
            # Chain the original error so its traceback is preserved.
            raise Exception(f"Failed to extract text from PDF: {e}") from e

        full_text = "".join(page_chunks)
        print(f"✅ Extracted {len(full_text)} characters from PDF")
        return full_text

    def validate_extracted_text(self, text: str, *, alpha_threshold: int = 50) -> bool:
        """
        Validate that extracted text is not empty and contains meaningful content.

        Args:
            text: The extracted text to validate
            alpha_threshold: The text must contain strictly MORE than this
                many alphabetic characters to be considered valid
                (default 50, i.e. at least 51 letters).

        Returns:
            bool: True if text is valid, False otherwise
        """
        # Empty, None, or whitespace-only input is never valid.
        if not text or not text.strip():
            return False

        # Count alphabetic characters (str.isalpha, so any Unicode letter).
        alphabetic_chars = sum(1 for char in text if char.isalpha())
        return alphabetic_chars > alpha_threshold
|