# file_processor/processor.py


import PyPDF2
from docx import Document
import magic
import os

class FileProcessor:
    """Extract plain text from TXT, PDF and DOCX files.

    The file type is determined from the file's content via
    ``magic.from_file(..., mime=True)``; the file extension is not consulted.
    """

    # MIME types that ``process_file`` knows how to handle.
    _MIME_TXT = 'text/plain'
    _MIME_PDF = 'application/pdf'
    _MIME_DOCX = (
        'application/vnd.openxmlformats-officedocument'
        '.wordprocessingml.document'
    )

    def process_file(self, filepath: str) -> str:
        """Return the text content of the file at *filepath*.

        Args:
            filepath: Path of the file to read.

        Returns:
            The extracted text as a single string.

        Raises:
            ValueError: If the detected MIME type is not plain text, PDF
                or DOCX.
        """
        file_type = magic.from_file(filepath, mime=True)

        if file_type == self._MIME_TXT:
            return self._process_txt(filepath)
        if file_type == self._MIME_PDF:
            return self._process_pdf(filepath)
        if file_type == self._MIME_DOCX:
            return self._process_docx(filepath)
        raise ValueError(f"Unsupported file type: {file_type}")

    def _process_txt(self, filepath: str) -> str:
        """Read a UTF-8 text file and return its full content.

        A leading UTF-8 byte-order mark, if present, is dropped so it does
        not leak into the returned text as a stray ``\\ufeff`` character.

        Raises:
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # 'utf-8-sig' decodes exactly like 'utf-8' but skips an optional BOM.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            return f.read()

    def _process_pdf(self, filepath: str) -> str:
        """Return the text of every page of a PDF, concatenated in order.

        Page texts are joined with no separator between pages.
        """
        with open(filepath, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # Extraction must happen while the file is still open.
            # ``or ""`` is defensive: a page that yields None (or an empty
            # result) contributes nothing instead of raising TypeError.
            return "".join(page.extract_text() or "" for page in reader.pages)

    def _process_docx(self, filepath: str) -> str:
        """Return the text of ``doc.paragraphs``, one paragraph per line."""
        doc = Document(filepath)
        return '\n'.join(para.text for para in doc.paragraphs)