# file_processor/processor.py
"""Extract plain text from TXT, PDF and DOCX files."""

import os

import magic
import PyPDF2
from docx import Document


class FileProcessor:
    """Return the text content of a file, dispatching on its detected MIME type."""

    # MIME type -> name of the method that handles it. Method *names* (not
    # bound functions) are stored so that subclass overrides are honoured.
    _HANDLERS = {
        'text/plain': '_process_txt',
        'application/pdf': '_process_pdf',
        (
            'application/vnd.openxmlformats-officedocument'
            '.wordprocessingml.document'
        ): '_process_docx',
    }

    def process_file(self, filepath):
        """Detect the MIME type of *filepath* and return its extracted text.

        Args:
            filepath: Path of the file to read (``str`` or ``os.PathLike``).

        Returns:
            The text produced by the handler for the detected type.

        Raises:
            ValueError: If the detected MIME type has no handler.
        """
        # Normalise once so every downstream call receives a plain path.
        path = os.fspath(filepath)
        file_type = magic.from_file(path, mime=True)

        handler_name = self._HANDLERS.get(file_type)
        if handler_name is None:
            raise ValueError(f"Unsupported file type: {file_type}")
        return getattr(self, handler_name)(path)

    def _process_txt(self, filepath):
        """Return the whole file decoded as UTF-8.

        Raises:
            UnicodeDecodeError: If the file is not valid UTF-8 (decoding uses
                the default strict error handling).
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()

    def _process_pdf(self, filepath):
        """Return the text of every page, concatenated in page order.

        Page texts are joined with no separator between them.
        """
        with open(filepath, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # Extraction happens inside the ``with`` so the file is still open
            # while pages are read. ``or ""`` is defensive: a falsy result for
            # a page is treated as empty text instead of breaking the join.
            return "".join(page.extract_text() or "" for page in reader.pages)

    def _process_docx(self, filepath):
        """Return the text of ``doc.paragraphs``, one paragraph per line."""
        doc = Document(filepath)
        return '\n'.join(para.text for para in doc.paragraphs)