# Spaces: deenasun / ai-sl-api (Running) - File size: 10,201 Bytes

import os
import sys
from pathlib import Path
from typing import Optional, Union
import logging

# Import document parsing libraries
try:
    import PyPDF2
    from docx import Document
    import ebooklib
    from ebooklib import epub
    from bs4 import BeautifulSoup
except ImportError as e:
    print(f"Missing required dependency: {e}")
    print("Please install dependencies with: pip install -r requirements.txt")
    sys.exit(1)

# Configure logging: set up the root logger at INFO level when this module is
# loaded, and create the module-level logger used by the code below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DocumentParser:
    """
    Parse and extract plain text from several document formats.

    Supports PDF, TXT, DOC, DOCX, and EPUB files. The format is chosen from
    the file extension first and, when the extension is missing or not
    recognised, from the leading bytes of the file.
    """

    # Number of leading bytes inspected when sniffing a file's content.
    _SNIFF_BYTES = 1024

    # File extension (lower case, with leading dot) -> MIME type.
    _EXTENSION_MAP = {
        '.pdf': 'application/pdf',
        '.txt': 'text/plain',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.doc': 'application/msword',
        '.epub': 'application/epub+zip',
    }

    def __init__(self):
        # Dispatch table: MIME type -> bound method that extracts the text.
        self.supported_formats = {
            'application/pdf': self._parse_pdf,
            'text/plain': self._parse_txt,
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': self._parse_docx,
            'application/msword': self._parse_doc,
            'application/epub+zip': self._parse_epub,
        }

    def get_file_type(self, file_path: Union[str, Path]) -> str:
        """
        Detect the MIME type of a file.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' if the type cannot be determined
        """
        return self._get_mime_from_extension(file_path)

    def _get_mime_from_extension(self, file_path: Union[str, Path]) -> str:
        """
        Determine MIME type from the file extension (case-insensitive).

        Falls back to content sniffing when the extension is missing or is
        not one of the known extensions.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' if the type cannot be determined
        """
        extension = Path(file_path).suffix.lower()
        mime_type = self._EXTENSION_MAP.get(extension)
        if mime_type is None:
            # No usable extension: inspect the file content instead.
            mime_type = self._detect_mime_by_content(file_path)
        return mime_type

    def _detect_mime_by_content(self, file_path: Union[str, Path]) -> str:
        """
        Detect MIME type by reading the beginning of the file.

        Recognises the PDF signature, ZIP containers (EPUB when the archive
        declares the EPUB mimetype, otherwise assumed to be DOCX) and UTF-8
        text. Never raises: any error is logged and reported as 'unknown'.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' if the type cannot be determined
        """
        import codecs
        import zipfile

        try:
            with open(file_path, 'rb') as f:
                header = f.read(self._SNIFF_BYTES)

            # PDF detection
            if header.startswith(b'%PDF'):
                return 'application/pdf'

            # ZIP-based formats (DOCX, EPUB)
            if header.startswith(b'PK\x03\x04'):
                try:
                    with zipfile.ZipFile(file_path, 'r') as zf:
                        # An EPUB archive carries a 'mimetype' member that
                        # names the EPUB MIME type.
                        if 'mimetype' in zf.namelist():
                            declared = zf.read('mimetype').decode('utf-8').strip()
                            if declared == 'application/epub+zip':
                                return 'application/epub+zip'
                    # If not EPUB, assume DOCX
                    return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                except Exception as e:
                    # Best effort: a damaged archive falls through to the
                    # plain-text check. Catching Exception (not a bare
                    # except) lets KeyboardInterrupt/SystemExit propagate.
                    logger.debug("Could not inspect ZIP container %s: %s", file_path, e)

            # Plain text detection (try to decode as UTF-8). The header may
            # end in the middle of a multi-byte character, so only require a
            # complete final sequence when the whole file has been read.
            decoder = codecs.getincrementaldecoder('utf-8')()
            try:
                decoder.decode(header, final=len(header) < self._SNIFF_BYTES)
                return 'text/plain'
            except UnicodeDecodeError:
                pass

        except Exception as e:
            logger.warning("Error detecting MIME type by content: %s", e)

        return 'unknown'

    def extract_text(self, file_path: Union[str, Path]) -> Optional[str]:
        """
        Extract text from a document file.

        Errors are logged rather than raised, so callers only need to check
        for None.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text as string, or None if the file does not exist,
            its type is unsupported, or extraction fails
        """
        file_path = Path(file_path)

        if not file_path.exists():
            logger.error("File not found: %s", file_path)
            return None

        try:
            mime_type = self.get_file_type(file_path)
            logger.info("Detected file type: %s", mime_type)

            handler = self.supported_formats.get(mime_type)
            if handler is None:
                logger.error("Unsupported file type: %s", mime_type)
                return None
            return handler(file_path)

        except Exception as e:
            logger.error("Error extracting text from %s: %s", file_path, e)
            return None

    def _parse_pdf(self, file_path: Path) -> str:
        """
        Extract text from PDF file.

        Pages that yield no text are skipped; the remaining pages are joined
        with newlines.

        Args:
            file_path: Path to PDF file

        Returns:
            Extracted text with leading/trailing whitespace removed

        Raises:
            Exception: Any error raised while opening or reading the PDF is
                logged and re-raised.
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Extraction must happen while the file is still open.
                page_texts = [page.extract_text() for page in pdf_reader.pages]
        except Exception as e:
            logger.error("Error parsing PDF %s: %s", file_path, e)
            raise

        return "\n".join(chunk for chunk in page_texts if chunk).strip()

    def _parse_txt(self, file_path: Path) -> str:
        """
        Extract text from plain text file.

        The file is read as UTF-8; if that fails to decode, it is re-read as
        Latin-1.

        Args:
            file_path: Path to text file

        Returns:
            File content, unmodified

        Raises:
            Exception: Any error other than the initial UTF-8 decode failure
                is logged and re-raised.
        """
        try:
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    return file.read()
            except UnicodeDecodeError:
                # Try with different encoding
                with open(file_path, 'r', encoding='latin-1') as file:
                    return file.read()
        except Exception as e:
            logger.error("Error reading text file %s: %s", file_path, e)
            raise

    def _parse_docx(self, file_path: Path) -> str:
        """
        Extract text from DOCX file.

        Only the document's paragraphs are read, one per output line.

        Args:
            file_path: Path to DOCX file

        Returns:
            Extracted text with leading/trailing whitespace removed

        Raises:
            Exception: Any error raised while loading the document is logged
                and re-raised.
        """
        try:
            doc = Document(str(file_path))
            return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()
        except Exception as e:
            logger.error("Error parsing DOCX %s: %s", file_path, e)
            raise

    def _parse_doc(self, file_path: Path) -> str:
        """
        Extract text from DOC file (legacy Word format).
        Note: This requires an external converter, antiword or catdoc.

        antiword is tried first and catdoc second. A converter that is not
        installed is skipped, so catdoc is still used when antiword is
        missing.

        Args:
            file_path: Path to DOC file

        Returns:
            Converter output with leading/trailing whitespace removed

        Raises:
            RuntimeError: If neither converter is installed, or every
                installed converter exits with a non-zero status.
            Exception: Any other error raised while running a converter is
                logged and re-raised.
        """
        import subprocess

        failures = []
        for tool in ('antiword', 'catdoc'):
            try:
                result = subprocess.run([tool, str(file_path)],
                                        capture_output=True, text=True)
            except FileNotFoundError:
                # This converter is not installed; try the next one.
                continue
            except Exception as e:
                logger.error("Error parsing DOC %s: %s", file_path, e)
                raise

            if result.returncode == 0:
                return result.stdout.strip()
            failures.append(
                f"{tool} exited with status {result.returncode}: {result.stderr.strip()}"
            )

        if failures:
            message = "Could not convert DOC file. " + "; ".join(failures)
        else:
            message = "antiword or catdoc not found. Please install one of them for DOC file support."
        logger.error("Error parsing DOC %s: %s", file_path, message)
        raise RuntimeError(message)

    def _parse_epub(self, file_path: Path) -> str:
        """
        Extract text from EPUB file.

        Every document item in the book is decoded as UTF-8, stripped of
        markup, and appended as its own line.

        Args:
            file_path: Path to EPUB file

        Returns:
            Extracted text with leading/trailing whitespace removed

        Raises:
            Exception: Any error raised while reading or decoding the book is
                logged and re-raised.
        """
        try:
            book = epub.read_epub(str(file_path))
            sections = []

            for item in book.get_items():
                if item.get_type() == ebooklib.ITEM_DOCUMENT:
                    content = item.get_content().decode('utf-8')
                    soup = BeautifulSoup(content, 'html.parser')
                    sections.append(soup.get_text())

            return "\n".join(sections).strip()

        except Exception as e:
            logger.error("Error parsing EPUB %s: %s", file_path, e)
            raise


def main():
    """
    Command-line entry point for the DocumentParser.

    Expects exactly one argument, the path of the document to read. Prints
    the extracted text followed by its character count. Exits with status 1
    when the argument count is wrong or when no text could be extracted.
    """
    argv = sys.argv
    if len(argv) != 2:
        print("Usage: python document_parsing.py <file_path>")
        print("Supported formats: PDF, TXT, DOC, DOCX, EPUB")
        sys.exit(1)

    target = argv[1]
    document_parser = DocumentParser()

    print(f"Extracting text from: {target}")
    print("-" * 50)

    content = document_parser.extract_text(target)

    # Guard clause: None and an empty string are both reported as failure.
    if not content:
        print("Failed to extract text from the file.")
        sys.exit(1)

    print("Extracted text:")
    print(content)
    print(f"\nTotal characters: {len(content)}")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()