|
import codecs
import logging
import os
import subprocess
import sys
import zipfile
from pathlib import Path
from typing import Optional, Union

# Third-party parsers are imported together so a missing package produces one
# actionable message instead of a traceback.
try:
    import PyPDF2
    import ebooklib
    from bs4 import BeautifulSoup
    from docx import Document
    from ebooklib import epub
except ImportError as e:
    print(f"Missing required dependency: {e}")
    print("Please install dependencies with: pip install -r requirements.txt")
    sys.exit(1)
|
|
|
|
|
# Configure the root logger at INFO level and create this module's logger.
# NOTE(review): basicConfig() runs at import time, so importing this module
# configures logging for the whole process if nothing configured it earlier.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
|
|
|
class DocumentParser:
    """
    A class to parse and extract text from various document formats.

    Supports PDF, TXT, DOC, DOCX, and EPUB files. The file type is resolved
    from the file extension first and, when the extension is not recognised,
    from the leading bytes of the file. Each supported MIME type is dispatched
    to a dedicated ``_parse_*`` method through ``supported_formats``.
    """

    # Number of leading bytes inspected when sniffing a file's content.
    _SNIFF_BYTES = 1024

    _DOCX_MIME = (
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    )

    # Lower-cased file extension (including the dot) -> MIME type.
    _EXTENSION_MIME_TYPES = {
        '.pdf': 'application/pdf',
        '.txt': 'text/plain',
        '.docx': _DOCX_MIME,
        '.doc': 'application/msword',
        '.epub': 'application/epub+zip',
    }

    # External converters tried, in order, for legacy .doc files.
    _DOC_CONVERTERS = ('antiword', 'catdoc')

    def __init__(self):
        """Build the MIME type -> parser method dispatch table."""
        self.supported_formats = {
            'application/pdf': self._parse_pdf,
            'text/plain': self._parse_txt,
            self._DOCX_MIME: self._parse_docx,
            'application/msword': self._parse_doc,
            'application/epub+zip': self._parse_epub,
        }

    def get_file_type(self, file_path: Union[str, Path]) -> str:
        """
        Detect the MIME type of a file.

        The file extension is consulted first; if it is not one of the known
        extensions, the file's content is inspected instead.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' if the type cannot be determined
        """
        return self._get_mime_from_extension(file_path)

    def _get_mime_from_extension(self, file_path: Union[str, Path]) -> str:
        """
        Determine MIME type from file extension, falling back to content.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' if neither the extension nor the
            content identifies the file
        """
        extension = Path(file_path).suffix.lower()
        mime_type = self._EXTENSION_MIME_TYPES.get(extension)
        if mime_type is None:
            # Unrecognised (or missing) extension: look at the bytes instead.
            mime_type = self._detect_mime_by_content(file_path)
        return mime_type

    def _detect_mime_by_content(self, file_path: Union[str, Path]) -> str:
        """
        Detect MIME type by reading file content.

        Checks, in order: the PDF magic number, the ZIP magic number (EPUB and
        DOCX are ZIP containers), and finally whether the leading bytes are
        valid UTF-8 text.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' if the content is not recognised
            or the file cannot be read
        """
        try:
            with open(file_path, 'rb') as f:
                header = f.read(self._SNIFF_BYTES)
                # Whether the file continues past the sniffed prefix.
                has_more = bool(f.read(1))

            if header.startswith(b'%PDF'):
                return 'application/pdf'

            if header.startswith(b'PK\x03\x04'):
                try:
                    return self._sniff_zip_container(file_path)
                except zipfile.BadZipFile as e:
                    # ZIP signature but not a readable archive; keep going and
                    # let the text check below have the final say.
                    logger.debug("Not a readable ZIP archive %s: %s", file_path, e)

            # When the prefix was cut short, a multi-byte character may be
            # split at the boundary; that must not count as invalid UTF-8.
            if self._is_utf8(header, final=not has_more):
                return 'text/plain'

        except Exception as e:
            # Detection is best-effort: report the problem and give up.
            logger.warning("Error detecting MIME type by content: %s", e)

        return 'unknown'

    def _sniff_zip_container(self, file_path: Union[str, Path]) -> str:
        """
        Classify a ZIP archive as EPUB, DOCX, or unknown.

        Args:
            file_path: Path to a file that starts with the ZIP signature

        Returns:
            The EPUB MIME type if the archive has a 'mimetype' entry declaring
            it, the DOCX MIME type if the archive contains entries under
            'word/', otherwise 'unknown'

        Raises:
            zipfile.BadZipFile: If the file is not a valid ZIP archive
        """
        with zipfile.ZipFile(file_path, 'r') as zf:
            names = zf.namelist()
            if 'mimetype' in names:
                declared = zf.read('mimetype').decode('utf-8', errors='replace')
                if declared.strip() == 'application/epub+zip':
                    return 'application/epub+zip'

        # Only claim DOCX when the archive actually carries Word parts, so
        # that arbitrary ZIP files are not handed to the DOCX parser.
        if any(name.startswith('word/') for name in names):
            return self._DOCX_MIME

        return 'unknown'

    @staticmethod
    def _is_utf8(data: bytes, final: bool) -> bool:
        """
        Check whether bytes are valid UTF-8.

        Args:
            data: Bytes to validate
            final: True if ``data`` is the complete content; False if it is
                only a prefix, in which case an incomplete trailing multi-byte
                sequence is tolerated

        Returns:
            True if the bytes decode as UTF-8, False otherwise
        """
        decoder = codecs.getincrementaldecoder('utf-8')()
        try:
            decoder.decode(data, final=final)
        except UnicodeDecodeError:
            return False
        return True

    def extract_text(self, file_path: Union[str, Path]) -> Optional[str]:
        """
        Extract text from a document file.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text as string (possibly empty), or None if the file
            does not exist, its type is unsupported, or extraction fails
        """
        file_path = Path(file_path)

        if not file_path.exists():
            logger.error("File not found: %s", file_path)
            return None

        try:
            mime_type = self.get_file_type(file_path)
            logger.info("Detected file type: %s", mime_type)

            parse = self.supported_formats.get(mime_type)
            if parse is None:
                logger.error("Unsupported file type: %s", mime_type)
                return None

            return parse(file_path)

        except Exception as e:
            # Public entry point: parser failures are reported as None.
            logger.error("Error extracting text from %s: %s", file_path, e)
            return None

    def _parse_pdf(self, file_path: Path) -> str:
        """
        Extract text from PDF file.

        Args:
            file_path: Path to PDF file

        Returns:
            Text of all pages that yielded any text, joined by newlines and
            stripped of leading/trailing whitespace

        Raises:
            Exception: Any error raised while opening or reading the PDF is
                logged and re-raised
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Extraction happens while the file is still open.
                page_texts = [page.extract_text() for page in pdf_reader.pages]
        except Exception as e:
            logger.error("Error parsing PDF %s: %s", file_path, e)
            raise

        # Pages without extractable text are skipped.
        return "\n".join(text for text in page_texts if text).strip()

    def _parse_txt(self, file_path: Path) -> str:
        """
        Extract text from plain text file.

        The file is decoded as UTF-8 (a leading byte-order mark, if present,
        is dropped); if that fails it is decoded as Latin-1 instead.

        Args:
            file_path: Path to text file

        Returns:
            Extracted text

        Raises:
            Exception: Any error raised while reading the file is logged and
                re-raised
        """
        source = Path(file_path)
        try:
            try:
                return source.read_text(encoding='utf-8-sig')
            except UnicodeDecodeError:
                # Latin-1 maps every byte to a character, so this cannot fail
                # on decoding.
                return source.read_text(encoding='latin-1')
        except Exception as e:
            logger.error("Error reading text file %s: %s", file_path, e)
            raise

    def _parse_docx(self, file_path: Path) -> str:
        """
        Extract text from DOCX file.

        Only the document's paragraphs are read.

        Args:
            file_path: Path to DOCX file

        Returns:
            Paragraph texts joined by newlines, stripped of leading/trailing
            whitespace

        Raises:
            Exception: Any error raised while reading the document is logged
                and re-raised
        """
        try:
            doc = Document(str(file_path))
            return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()
        except Exception as e:
            logger.error("Error parsing DOCX %s: %s", file_path, e)
            raise

    def _parse_doc(self, file_path: Path) -> str:
        """
        Extract text from DOC file (legacy Word format).

        Note: This requires an external converter, antiword or catdoc, on the
        PATH. antiword is tried first; catdoc is tried if antiword is not
        installed or exits with a non-zero status.

        Args:
            file_path: Path to DOC file

        Returns:
            Standard output of the first converter that succeeds, stripped of
            leading/trailing whitespace

        Raises:
            RuntimeError: If no converter is installed, or every installed
                converter failed on the file
        """
        failures = []

        for tool in self._DOC_CONVERTERS:
            try:
                result = subprocess.run(
                    [tool, str(file_path)],
                    capture_output=True,
                    text=True,
                )
            except FileNotFoundError:
                # This converter is not installed; try the next one.
                continue

            if result.returncode == 0:
                return result.stdout.strip()

            failures.append(
                f"{tool} exited with status {result.returncode}: "
                f"{result.stderr.strip()}"
            )

        if failures:
            message = "Could not convert DOC file. " + "; ".join(failures)
        else:
            message = (
                "antiword or catdoc not found. "
                "Please install one of them for DOC file support."
            )

        logger.error("Error parsing DOC %s: %s", file_path, message)
        raise RuntimeError(message)

    def _parse_epub(self, file_path: Path) -> str:
        """
        Extract text from EPUB file.

        Every item of type ``ebooklib.ITEM_DOCUMENT`` is decoded as UTF-8 and
        reduced to its text content with BeautifulSoup.

        Args:
            file_path: Path to EPUB file

        Returns:
            Text of all document items joined by newlines, stripped of
            leading/trailing whitespace

        Raises:
            Exception: Any error raised while reading the book is logged and
                re-raised
        """
        try:
            book = epub.read_epub(str(file_path))
            sections = []

            for item in book.get_items():
                if item.get_type() == ebooklib.ITEM_DOCUMENT:
                    markup = item.get_content().decode('utf-8')
                    sections.append(BeautifulSoup(markup, 'html.parser').get_text())

            return "\n".join(sections).strip()

        except Exception as e:
            logger.error("Error parsing EPUB %s: %s", file_path, e)
            raise
|
|
|
|
|
def main() -> None:
    """
    Main function to demonstrate usage of the DocumentParser.

    Expects exactly one command-line argument, the path of the document to
    read. Prints the extracted text and its length. Exits with status 1 when
    the arguments are wrong or when extraction fails.
    """
    if len(sys.argv) != 2:
        print("Usage: python document_parsing.py <file_path>")
        print("Supported formats: PDF, TXT, DOC, DOCX, EPUB")
        sys.exit(1)

    file_path = sys.argv[1]
    parser = DocumentParser()

    print(f"Extracting text from: {file_path}")
    print("-" * 50)

    extracted_text = parser.extract_text(file_path)

    # extract_text() signals failure with None; an empty string is a document
    # that was read successfully but contains no text.
    if extracted_text is None:
        print("Failed to extract text from the file.")
        sys.exit(1)

    print("Extracted text:")
    print(extracted_text)
    print(f"\nTotal characters: {len(extracted_text)}")
|
|
|
|
|
# Run the command-line demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|