Spaces:
Sleeping
Sleeping
"""
Text Extractor Module

Handles extracting text content from PDF files.
"""
import pdfplumber | |
class TextExtractor:
    """Extract plain text from PDF files and sanity-check the result."""

    # Validation threshold: extracted text must contain strictly more
    # alphabetic characters than this to be considered meaningful.
    MIN_ALPHABETIC_CHARS = 50

    def __init__(self):
        """Initialize the text extractor (no instance state is kept)."""

    async def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract text from a PDF file, page by page.

        Each page that yields text is emitted as a ``--- Page N ---`` header
        (1-based page number) followed by that page's text. Pages for which
        ``extract_text()`` returns nothing are skipped entirely.

        Note: the method is declared ``async`` but contains no ``await``; the
        extraction itself runs synchronously.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            str: Extracted text content (empty string if no page had text)

        Raises:
            Exception: If text extraction fails; the underlying error is
                attached as ``__cause__``.
        """
        print("π Extracting text from PDF...")
        try:
            with pdfplumber.open(pdf_path) as pdf:
                sections = []
                for page_num, page in enumerate(pdf.pages, start=1):
                    text = page.extract_text()
                    if text:
                        sections.append(f"\n--- Page {page_num} ---\n{text}")
            # Join once instead of growing a string with repeated "+=".
            full_text = "".join(sections)
            print(f"β Extracted {len(full_text)} characters from PDF")
            return full_text
        except Exception as e:
            # Chain the original error so its traceback is preserved.
            raise Exception(f"Failed to extract text from PDF: {str(e)}") from e

    def validate_extracted_text(self, text: str) -> bool:
        """
        Validate that extracted text contains meaningful content.

        Text is considered valid when it has more than
        ``MIN_ALPHABETIC_CHARS`` alphabetic characters.

        Args:
            text: The extracted text to validate

        Returns:
            bool: True if text is valid, False otherwise
        """
        # Empty or missing text can never pass; whitespace-only text is
        # rejected by the alphabetic count below (it has zero letters).
        if not text:
            return False

        alphabetic_chars = sum(1 for char in text if char.isalpha())
        return alphabetic_chars > self.MIN_ALPHABETIC_CHARS