# hackrx-llm-document-processor / document_processor.py
# PalakMeena's picture
# Upload 9 files
# aaa7449 verified
# Raw
# History Blame Contribute Delete
# 3.2 kB
import PyPDF2
from docx import Document
import os
from typing import List
class DocumentProcessor:
    """Read a document from disk and break its text into small chunks.

    PDF files are read with ``PyPDF2``, Word files with ``docx.Document``
    and ``.txt`` files as UTF-8 text. Extraction is best-effort: reader
    errors are printed and yield empty (or partial) text instead of raising.
    """

    def __init__(self, chunk_size: int = 200, chunk_overlap: int = 50):
        """Store the chunking configuration.

        Args:
            chunk_size: Character budget for a chunk; a line is appended to
                the current chunk only while the total stays strictly below
                this value.
            chunk_overlap: Stored on the instance for callers that read it.
                ``_split_text`` does not apply any overlap.
        """
        self.chunk_size = chunk_size  # Smaller chunks for better matching
        self.chunk_overlap = chunk_overlap

    def process_document(self, file_path: str) -> List[str]:
        """Process document and return text chunks.

        The reader is picked from the (case-insensitive) file extension.
        After a successful run the input file is deleted, unless its path
        contains ``"sample_policy"``.

        Args:
            file_path: Path of the document to read.

        Returns:
            The list of text chunks, or ``[]`` when the extension is not
            supported or processing fails (the error is printed).
        """
        try:
            file_extension = os.path.splitext(file_path)[1].lower()
            if file_extension == '.pdf':
                text = self._extract_pdf_text(file_path)
            elif file_extension in ('.docx', '.doc'):
                # NOTE(review): '.doc' goes through the same reader as
                # '.docx' -- confirm the docx library can open legacy files.
                text = self._extract_docx_text(file_path)
            elif file_extension == '.txt':
                text = self._extract_txt_text(file_path)
            else:
                raise ValueError(f"Unsupported file type: {file_extension}")

            # Split into chunks
            chunks = self._split_text(text)
        except Exception as e:
            print(f"Error processing document: {e}")
            return []

        # Cleanup runs outside the try block so that a failed delete cannot
        # throw away chunks that were already extracted.
        self._remove_temp_file(file_path)
        return chunks

    def _remove_temp_file(self, file_path: str) -> None:
        """Delete the processed file, but keep sample_policy.txt."""
        # os.fspath lets os.PathLike arguments pass the substring check.
        if "sample_policy" in os.fspath(file_path):
            return
        try:
            os.remove(file_path)
        except FileNotFoundError:
            # Already gone: nothing to clean up.
            pass
        except OSError as e:
            print(f"Error removing temp file: {e}")

    def _extract_txt_text(self, file_path: str) -> str:
        """Extract text from .txt file.

        Returns:
            The file content, or ``""`` when the file cannot be read or
            decoded (the error is printed).
        """
        try:
            # utf-8-sig decodes plain UTF-8 as before and also drops a
            # leading byte-order mark, which would otherwise end up as a
            # stray '\ufeff' at the start of the first chunk.
            with open(file_path, 'r', encoding='utf-8-sig') as file:
                return file.read()
        except Exception as e:
            print(f"Error reading TXT: {e}")
            return ""

    def _extract_pdf_text(self, file_path: str) -> str:
        """Extract text from PDF.

        Returns:
            The text of every page, each followed by a newline. If reading
            fails part-way, the text collected so far is returned and the
            error is printed.
        """
        page_texts = []
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    # Guard against a page that yields no text object, so a
                    # single empty page does not abort the remaining pages.
                    page_texts.append((page.extract_text() or "") + "\n")
        except Exception as e:
            print(f"Error reading PDF: {e}")
        return "".join(page_texts)

    def _extract_docx_text(self, file_path: str) -> str:
        """Extract text from Word document.

        Returns:
            The text of every paragraph, each followed by a newline. If
            reading fails, whatever was collected is returned and the error
            is printed.
        """
        paragraph_texts = []
        try:
            doc = Document(file_path)
            for paragraph in doc.paragraphs:
                paragraph_texts.append(paragraph.text + "\n")
        except Exception as e:
            print(f"Error reading DOCX: {e}")
        return "".join(paragraph_texts)

    def _split_text(self, text: str) -> List[str]:
        """Split text into smaller, more focused chunks.

        Blank lines are dropped and the remaining lines are stripped, then
        packed greedily: consecutive lines are joined with single spaces
        while the running length stays below ``self.chunk_size``. A single
        line longer than the budget becomes a chunk of its own and is not
        cut.

        Args:
            text: Raw document text; an empty value yields ``[]``.

        Returns:
            The chunks in document order.
        """
        if not text:
            return []

        # Split by lines first, then combine
        lines = [line.strip() for line in text.split('\n') if line.strip()]

        chunks = []
        pending = []  # lines collected for the chunk being built
        pending_len = 0  # length of those lines, one trailing space each
        for line in lines:
            if pending_len + len(line) < self.chunk_size:
                pending.append(line)
                pending_len += len(line) + 1
            else:
                if pending:
                    chunks.append(" ".join(pending))
                pending = [line]
                pending_len = len(line) + 1

        # Add the last chunk
        if pending:
            chunks.append(" ".join(pending))
        return chunks