"""Extract plain text from PDF, TXT, DOC, DOCX and EPUB documents."""

import codecs
import logging
import os
import subprocess
import sys
import zipfile
from pathlib import Path
from typing import Optional, Union

# Import document parsing libraries
try:
    import PyPDF2
    from docx import Document
    import ebooklib
    from ebooklib import epub
    from bs4 import BeautifulSoup
except ImportError as e:
    print(f"Missing required dependency: {e}")
    print("Please install dependencies with: pip install -r requirements.txt")
    sys.exit(1)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

_PDF_MIME = 'application/pdf'
_TXT_MIME = 'text/plain'
_DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
_DOC_MIME = 'application/msword'
_EPUB_MIME = 'application/epub+zip'
_UNKNOWN_MIME = 'unknown'

# File extension -> MIME type, consulted before any content sniffing.
_EXTENSION_MIME_TYPES = {
    '.pdf': _PDF_MIME,
    '.txt': _TXT_MIME,
    '.docx': _DOCX_MIME,
    '.doc': _DOC_MIME,
    '.epub': _EPUB_MIME,
}

# Number of leading bytes read when sniffing a file's type from its content.
_SNIFF_BYTES = 1024

# External converters tried, in order, for legacy .doc files.
_DOC_CONVERTERS = ('antiword', 'catdoc')


class DocumentParser:
    """
    A class to parse and extract text from various document formats.
    Supports PDF, TXT, DOC, DOCX, and EPUB files.
    """

    def __init__(self):
        # MIME type -> bound method that extracts text for that type.
        self.supported_formats = {
            _PDF_MIME: self._parse_pdf,
            _TXT_MIME: self._parse_txt,
            _DOCX_MIME: self._parse_docx,
            _DOC_MIME: self._parse_doc,
            _EPUB_MIME: self._parse_epub,
        }

    def get_file_type(self, file_path: Union[str, Path]) -> str:
        """
        Detect the MIME type of a file.

        The file extension is consulted first; when it is missing or not
        recognised, the file content is sniffed instead.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' if the type cannot be determined
        """
        return self._get_mime_from_extension(file_path)

    def _get_mime_from_extension(self, file_path: Union[str, Path]) -> str:
        """
        Determine MIME type from file extension.

        Falls back to content-based detection when the (case-insensitive)
        extension is absent or not in the known extension table.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' if the type cannot be determined
        """
        extension = Path(file_path).suffix.lower()
        mime_type = _EXTENSION_MIME_TYPES.get(extension, _UNKNOWN_MIME)

        # If no extension or unknown extension, try to detect by content
        if mime_type == _UNKNOWN_MIME:
            mime_type = self._detect_mime_by_content(file_path)

        return mime_type

    def _detect_mime_by_content(self, file_path: Union[str, Path]) -> str:
        """
        Detect MIME type by reading file content.

        Checks, in order: the PDF magic number, the ZIP magic number
        (EPUB or DOCX), and finally whether the leading bytes are valid
        UTF-8 (plain text). Detection is best-effort and never raises for
        unreadable files.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' if nothing matched or the file
            could not be read
        """
        try:
            with open(file_path, 'rb') as f:
                header = f.read(_SNIFF_BYTES)
        except (OSError, ValueError) as e:
            logger.warning(f"Error detecting MIME type by content: {e}")
            return _UNKNOWN_MIME

        # PDF detection
        if header.startswith(b'%PDF'):
            return _PDF_MIME

        # ZIP-based formats (DOCX, EPUB)
        if header.startswith(b'PK\x03\x04'):
            zip_mime = self._sniff_zip_container(file_path)
            if zip_mime is not None:
                return zip_mime
            # An unreadable archive falls through to the text check below.

        # Plain text detection. A full-size header may have been cut in the
        # middle of a multi-byte character, which must not count as invalid.
        if self._is_utf8_prefix(header, truncated=len(header) == _SNIFF_BYTES):
            return _TXT_MIME

        return _UNKNOWN_MIME

    @staticmethod
    def _sniff_zip_container(file_path: Union[str, Path]) -> Optional[str]:
        """
        Classify a ZIP archive as EPUB or DOCX.

        An archive whose 'mimetype' member declares 'application/epub+zip'
        is an EPUB; any other readable archive is assumed to be DOCX.

        Args:
            file_path: Path to a file that starts with the ZIP magic number

        Returns:
            The EPUB or DOCX MIME type, or None if the archive could not be
            inspected
        """
        try:
            with zipfile.ZipFile(file_path, 'r') as zf:
                if 'mimetype' in zf.namelist():
                    declared = zf.read('mimetype').decode('utf-8').strip()
                    if declared == _EPUB_MIME:
                        return _EPUB_MIME
        except Exception as e:
            # Sniffing is best-effort: a corrupt, encrypted or oddly encoded
            # archive just means "not identified". Unlike a bare except,
            # this lets KeyboardInterrupt/SystemExit propagate.
            logger.debug(f"Could not inspect ZIP container {file_path}: {e}")
            return None

        # If not EPUB, assume DOCX
        return _DOCX_MIME

    @staticmethod
    def _is_utf8_prefix(data: bytes, truncated: bool) -> bool:
        """
        Report whether data is valid UTF-8.

        Args:
            data: Leading bytes of a file
            truncated: True when data may stop partway through the file, in
                which case an incomplete trailing multi-byte sequence is
                tolerated

        Returns:
            True if the bytes decode as UTF-8, False otherwise
        """
        decoder = codecs.getincrementaldecoder('utf-8')()
        try:
            # With final=False an incomplete trailing sequence is buffered
            # instead of raising; genuinely invalid bytes still raise.
            decoder.decode(data, final=not truncated)
        except UnicodeDecodeError:
            return False
        return True

    def extract_text(self, file_path: Union[str, Path]) -> Optional[str]:
        """
        Extract text from a document file.

        Errors are logged rather than raised.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text as string, or None if the file does not exist,
            its type is unsupported, or extraction fails
        """
        file_path = Path(file_path)

        if not file_path.exists():
            logger.error(f"File not found: {file_path}")
            return None

        try:
            mime_type = self.get_file_type(file_path)
            logger.info(f"Detected file type: {mime_type}")

            handler = self.supported_formats.get(mime_type)
            if handler is None:
                logger.error(f"Unsupported file type: {mime_type}")
                return None
            return handler(file_path)
        except Exception as e:
            # Top-level boundary: parsers log and re-raise, callers get None.
            logger.error(f"Error extracting text from {file_path}: {e}")
            return None

    def _parse_pdf(self, file_path: Path) -> str:
        """
        Extract text from PDF file.

        Pages that yield no text are skipped; the remaining page texts are
        joined with newlines.

        Args:
            file_path: Path to PDF file

        Returns:
            Extracted text with surrounding whitespace stripped

        Raises:
            Exception: Any error from opening or parsing the PDF is logged
                and re-raised
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                page_texts = [page.extract_text() for page in pdf_reader.pages]
        except Exception as e:
            logger.error(f"Error parsing PDF {file_path}: {e}")
            raise

        return "\n".join(text for text in page_texts if text).strip()

    def _parse_txt(self, file_path: Path) -> str:
        """
        Extract text from plain text file.

        The file is read as UTF-8; if that fails to decode, it is re-read
        as Latin-1.

        Args:
            file_path: Path to text file

        Returns:
            The file content, unmodified

        Raises:
            Exception: Any error from reading the file is logged and
                re-raised
        """
        try:
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    return file.read()
            except UnicodeDecodeError:
                # Try with different encoding
                with open(file_path, 'r', encoding='latin-1') as file:
                    return file.read()
        except Exception as e:
            logger.error(f"Error reading text file {file_path}: {e}")
            raise

    def _parse_docx(self, file_path: Path) -> str:
        """
        Extract text from DOCX file.

        Only the paragraphs exposed by the document's `paragraphs`
        attribute are read, joined with newlines.

        Args:
            file_path: Path to DOCX file

        Returns:
            Extracted text with surrounding whitespace stripped

        Raises:
            Exception: Any error from opening or parsing the document is
                logged and re-raised
        """
        try:
            doc = Document(file_path)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()
        except Exception as e:
            logger.error(f"Error parsing DOCX {file_path}: {e}")
            raise

    def _parse_doc(self, file_path: Path) -> str:
        """
        Extract text from DOC file (legacy Word format).

        Note: This requires an external converter, antiword or catdoc, on
        the PATH. antiword is tried first; catdoc is used if antiword is
        not installed or exits with a non-zero status.

        Args:
            file_path: Path to DOC file

        Returns:
            Converter output with surrounding whitespace stripped

        Raises:
            RuntimeError: If neither converter is installed, or every
                installed converter failed on this file
        """
        try:
            failures = []
            for tool in _DOC_CONVERTERS:
                try:
                    result = subprocess.run(
                        [tool, str(file_path)],
                        capture_output=True,
                        text=True,
                    )
                except FileNotFoundError:
                    # This converter is not installed; try the next one.
                    continue

                if result.returncode == 0:
                    return result.stdout.strip()
                failures.append(
                    f"{tool} exited with code {result.returncode}: "
                    f"{result.stderr.strip()}"
                )

            if failures:
                raise RuntimeError(
                    "Failed to convert DOC file: " + "; ".join(failures)
                )
            raise RuntimeError(
                "antiword or catdoc not found. "
                "Please install one of them for DOC file support."
            )
        except Exception as e:
            logger.error(f"Error parsing DOC {file_path}: {e}")
            raise

    def _parse_epub(self, file_path: Path) -> str:
        """
        Extract text from EPUB file.

        Every item of type ITEM_DOCUMENT is decoded as UTF-8, stripped of
        markup with BeautifulSoup, and the results are joined with newlines.

        Args:
            file_path: Path to EPUB file

        Returns:
            Extracted text with surrounding whitespace stripped

        Raises:
            Exception: Any error from reading or decoding the book is logged
                and re-raised
        """
        try:
            book = epub.read_epub(file_path)
            chapters = []
            for item in book.get_items():
                if item.get_type() != ebooklib.ITEM_DOCUMENT:
                    continue
                content = item.get_content().decode('utf-8')
                chapters.append(BeautifulSoup(content, 'html.parser').get_text())
            return "\n".join(chapters).strip()
        except Exception as e:
            logger.error(f"Error parsing EPUB {file_path}: {e}")
            raise


def main():
    """
    Main function to demonstrate usage of the DocumentParser.

    Expects exactly one command-line argument, the path of the document to
    read. Prints the extracted text and its length, or exits with status 1
    on a usage error or when no text was extracted.
    """
    if len(sys.argv) != 2:
        print("Usage: python document_parsing.py <file_path>")
        print("Supported formats: PDF, TXT, DOC, DOCX, EPUB")
        sys.exit(1)

    file_path = sys.argv[1]
    parser = DocumentParser()

    print(f"Extracting text from: {file_path}")
    print("-" * 50)

    extracted_text = parser.extract_text(file_path)

    # Both None (extraction failed) and an empty result count as failure.
    if not extracted_text:
        print("Failed to extract text from the file.")
        sys.exit(1)

    print("Extracted text:")
    print(extracted_text)
    print(f"\nTotal characters: {len(extracted_text)}")


if __name__ == "__main__":
    main()