File size: 10,201 Bytes
dbca390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f37f939
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbca390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
import os
import sys
from pathlib import Path
from typing import Optional, Union
import logging

# Import document parsing libraries
try:
    import PyPDF2
    from docx import Document
    import ebooklib
    from ebooklib import epub
    from bs4 import BeautifulSoup
except ImportError as e:
    print(f"Missing required dependency: {e}")
    print("Please install dependencies with: pip install -r requirements.txt")
    sys.exit(1)

# Configure logging
# NOTE(review): basicConfig runs at import time and configures the root
# logger (it is a no-op when the root logger already has handlers), so
# importing this module as a library also sets the process-wide level to INFO.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by DocumentParser for all diagnostics.
logger = logging.getLogger(__name__)


class DocumentParser:
    """
    Parse and extract plain text from various document formats.

    Supports PDF, TXT, DOC, DOCX, and EPUB files. The format is chosen from
    the file extension first and, when the extension is missing or not
    recognised, by sniffing the file content. ``extract_text`` is the public
    entry point; it logs failures and returns ``None`` instead of raising.
    """

    _MIME_PDF = 'application/pdf'
    _MIME_TXT = 'text/plain'
    _MIME_DOCX = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    _MIME_DOC = 'application/msword'
    _MIME_EPUB = 'application/epub+zip'
    _MIME_UNKNOWN = 'unknown'

    # Extension -> MIME type lookup, built once instead of on every call.
    _EXTENSION_MIME_TYPES = {
        '.pdf': _MIME_PDF,
        '.txt': _MIME_TXT,
        '.docx': _MIME_DOCX,
        '.doc': _MIME_DOC,
        '.epub': _MIME_EPUB,
    }

    # Number of leading bytes inspected when sniffing the content type.
    _SNIFF_BYTES = 1024

    def __init__(self):
        # Dispatch table: MIME type -> bound method that extracts the text.
        self.supported_formats = {
            self._MIME_PDF: self._parse_pdf,
            self._MIME_TXT: self._parse_txt,
            self._MIME_DOCX: self._parse_docx,
            self._MIME_DOC: self._parse_doc,
            self._MIME_EPUB: self._parse_epub,
        }

    def get_file_type(self, file_path: Union[str, Path]) -> str:
        """
        Detect the MIME type of a file.

        The extension is consulted first; content sniffing is only used when
        the extension is missing or unknown.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' if the type cannot be determined
        """
        return self._get_mime_from_extension(file_path)

    def _get_mime_from_extension(self, file_path: Union[str, Path]) -> str:
        """
        Determine MIME type from file extension (case-insensitive).

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' if neither the extension nor the
            file content identifies a supported format
        """
        extension = Path(file_path).suffix.lower()
        mime_type = self._EXTENSION_MIME_TYPES.get(extension, self._MIME_UNKNOWN)

        # If no extension or unknown extension, try to detect by content
        if mime_type == self._MIME_UNKNOWN:
            mime_type = self._detect_mime_by_content(file_path)

        return mime_type

    def _detect_mime_by_content(self, file_path: Union[str, Path]) -> str:
        """
        Detect MIME type by reading file content.

        Detection is best-effort: any error while reading the file is logged
        as a warning and reported as 'unknown' rather than raised.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown'
        """
        import codecs

        try:
            with open(file_path, 'rb') as f:
                header = f.read(self._SNIFF_BYTES)

            # PDF detection
            if header.startswith(b'%PDF'):
                return self._MIME_PDF

            # ZIP-based formats (DOCX, EPUB). A ZIP archive is binary, so it
            # must never fall through to the plain-text check below.
            if header.startswith(b'PK\x03\x04'):
                return self._detect_zip_mime(file_path)

            # Plain text detection (try to decode as UTF-8). The header may
            # end in the middle of a multi-byte character when the file is
            # longer than the sniffed prefix, so only treat the data as
            # complete (final) when the whole file was read.
            decoder = codecs.getincrementaldecoder('utf-8')()
            try:
                decoder.decode(header, final=len(header) < self._SNIFF_BYTES)
            except UnicodeDecodeError:
                return self._MIME_UNKNOWN
            return self._MIME_TXT

        except Exception as e:
            logger.warning(f"Error detecting MIME type by content: {e}")

        return self._MIME_UNKNOWN

    def _detect_zip_mime(self, file_path: Union[str, Path]) -> str:
        """
        Tell EPUB and DOCX apart for a file that starts with a ZIP signature.

        An EPUB is recognised by its 'mimetype' member; a DOCX by the
        presence of members under 'word/'. Any other archive is 'unknown'.
        Errors opening or reading the archive propagate to the caller.

        Args:
            file_path: Path to the ZIP-based file

        Returns:
            MIME type string, or 'unknown'
        """
        import zipfile

        with zipfile.ZipFile(file_path, 'r') as zf:
            names = zf.namelist()
            if 'mimetype' in names:
                declared = zf.read('mimetype').decode('utf-8', errors='replace').strip()
                if declared == self._MIME_EPUB:
                    return self._MIME_EPUB
            if any(name.startswith('word/') for name in names):
                return self._MIME_DOCX
        return self._MIME_UNKNOWN

    def extract_text(self, file_path: Union[str, Path]) -> Optional[str]:
        """
        Extract text from a document file.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text as string (possibly empty), or None if the file
            does not exist, its type is unsupported, or extraction fails
        """
        file_path = Path(file_path)

        if not file_path.exists():
            logger.error(f"File not found: {file_path}")
            return None

        try:
            mime_type = self.get_file_type(file_path)
            logger.info(f"Detected file type: {mime_type}")

            handler = self.supported_formats.get(mime_type)
            if handler is None:
                logger.error(f"Unsupported file type: {mime_type}")
                return None
            return handler(file_path)

        except Exception as e:
            # Top-level boundary: parsers log and re-raise, callers get None.
            logger.error(f"Error extracting text from {file_path}: {e}")
            return None

    def _parse_pdf(self, file_path: Path) -> str:
        """
        Extract text from PDF file.

        Args:
            file_path: Path to PDF file

        Returns:
            Text of all pages that yielded text, one page per line group,
            with surrounding whitespace stripped

        Raises:
            Exception: Any error from opening or parsing the PDF (logged,
                then re-raised)
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Pages must be read while the underlying file is still open.
                page_texts = [page.extract_text() for page in pdf_reader.pages]
        except Exception as e:
            logger.error(f"Error parsing PDF {file_path}: {e}")
            raise

        # Skip pages without extractable text (None or empty string).
        return "\n".join(text for text in page_texts if text).strip()

    def _parse_txt(self, file_path: Path) -> str:
        """
        Extract text from plain text file.

        The file is read as UTF-8 (a leading byte-order mark is dropped);
        if that fails it is re-read as Latin-1, which accepts any byte.

        Args:
            file_path: Path to text file

        Returns:
            Extracted text

        Raises:
            Exception: Any I/O error while reading the file (logged, then
                re-raised)
        """
        try:
            try:
                # 'utf-8-sig' behaves like 'utf-8' but strips a leading BOM.
                with open(file_path, 'r', encoding='utf-8-sig') as file:
                    return file.read()
            except UnicodeDecodeError:
                # Try with different encoding
                with open(file_path, 'r', encoding='latin-1') as file:
                    return file.read()
        except Exception as e:
            logger.error(f"Error reading text file {file_path}: {e}")
            raise

    def _parse_docx(self, file_path: Path) -> str:
        """
        Extract text from DOCX file.

        Only the document's paragraphs are read, one per line.

        Args:
            file_path: Path to DOCX file

        Returns:
            Extracted text with surrounding whitespace stripped

        Raises:
            Exception: Any error from opening or parsing the document
                (logged, then re-raised)
        """
        try:
            # Pass a plain string path rather than a Path object.
            doc = Document(str(file_path))
            return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()

        except Exception as e:
            logger.error(f"Error parsing DOCX {file_path}: {e}")
            raise

    def _parse_doc(self, file_path: Path) -> str:
        """
        Extract text from DOC file (legacy Word format).
        Note: This requires an external converter, antiword or catdoc.

        antiword is tried first and catdoc second. A converter that is not
        installed is skipped, so catdoc is still used when antiword is
        missing.

        Args:
            file_path: Path to DOC file

        Returns:
            Converter output with surrounding whitespace stripped

        Raises:
            RuntimeError: If no converter is installed, or every installed
                converter exits with a non-zero status
            Exception: Any other error from running a converter (logged,
                then re-raised)
        """
        import subprocess

        try:
            failures = []
            for tool in ('antiword', 'catdoc'):
                try:
                    result = subprocess.run([tool, str(file_path)],
                                            capture_output=True, text=True)
                except FileNotFoundError:
                    # This converter is not installed; try the next one.
                    continue
                if result.returncode == 0:
                    return result.stdout.strip()
                failures.append(f"{tool} exited with status {result.returncode}")

            if failures:
                # Converters were found but could not convert the file.
                raise RuntimeError("Could not convert DOC file: " + "; ".join(failures))
            raise RuntimeError("antiword or catdoc not found. Please install one of them for DOC file support.")

        except Exception as e:
            logger.error(f"Error parsing DOC {file_path}: {e}")
            raise

    def _parse_epub(self, file_path: Path) -> str:
        """
        Extract text from EPUB file.

        Every document item of the book is decoded as UTF-8, stripped of
        markup with BeautifulSoup, and the results are joined line by line.

        Args:
            file_path: Path to EPUB file

        Returns:
            Extracted text with surrounding whitespace stripped

        Raises:
            Exception: Any error from reading or parsing the book (logged,
                then re-raised)
        """
        try:
            # Pass a plain string path rather than a Path object.
            book = epub.read_epub(str(file_path))
            sections = []

            for item in book.get_items():
                if item.get_type() == ebooklib.ITEM_DOCUMENT:
                    content = item.get_content().decode('utf-8')
                    soup = BeautifulSoup(content, 'html.parser')
                    sections.append(soup.get_text())

            return "\n".join(sections).strip()

        except Exception as e:
            logger.error(f"Error parsing EPUB {file_path}: {e}")
            raise


def main():
    """
    Command-line entry point demonstrating usage of the DocumentParser.

    Expects exactly one argument, the path of the document to read. Prints
    the extracted text followed by its character count. Exits with status 1
    when the argument count is wrong or extraction fails.
    """
    if len(sys.argv) != 2:
        print("Usage: python document_parsing.py <file_path>")
        print("Supported formats: PDF, TXT, DOC, DOCX, EPUB")
        sys.exit(1)

    file_path = sys.argv[1]
    parser = DocumentParser()

    print(f"Extracting text from: {file_path}")
    print("-" * 50)

    extracted_text = parser.extract_text(file_path)

    # extract_text signals failure with None; an empty string is a valid
    # result (e.g. an empty text file) and must not be reported as an error.
    if extracted_text is None:
        print("Failed to extract text from the file.")
        sys.exit(1)

    print("Extracted text:")
    print(extracted_text)
    print(f"\nTotal characters: {len(extracted_text)}")


# Run the command-line demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()