import os
import re
from urllib.parse import urlparse

import requests
import fitz  # PyMuPDF
from docx import Document
# import mailparser
# === Base Abstract Class ===
class DocumentLoader:
    def __init__(self, source: str):
        self.source = source

    def extract(self):
        raise NotImplementedError("This method should be implemented in child classes.")
# === PDF Loader (Handles Local + URL) ===
class PDFLoader(DocumentLoader):
    def __init__(self, source: str):
        super().__init__(source)
        self.is_url = self.source.startswith("http")

    def _download_pdf(self):
        local_path = "temp_blob.pdf"
        response = requests.get(self.source)
        if response.status_code == 200:
            with open(local_path, 'wb') as f:
                f.write(response.content)
            return local_path
        else:
            raise Exception(f"Failed to download PDF. Status: {response.status_code}")
    def extract(self):
        pdf_path = self._download_pdf() if self.is_url else self.source
        doc = fitz.open(pdf_path)
        clauses = []
        current_heading = None
        current_chunk = []
        page_number = 0
        heading_pattern = re.compile(r'^(\d+(\.\d+)*[\s\-:]?|[A-Z][A-Z\s]{4,})')  # e.g., 3.1, 2.3.5, or UPPER TITLES

        for page in doc:
            page_number += 1
            blocks = page.get_text("blocks")
            blocks = sorted(blocks, key=lambda b: (b[1], b[0]))  # Sort top-to-bottom, left-to-right

            for b in blocks:
                text = b[4].strip()
                if not text or len(text) < 20:
                    continue

                if heading_pattern.match(text):
                    # Flush previous chunk
                    if current_chunk:
                        combined = " ".join(current_chunk).strip()
                        clauses.append({
                            "heading": current_heading,
                            "text": combined,
                            "page": page_number
                        })
                        current_chunk = []
                    current_heading = text  # New heading found
                else:
                    current_chunk.append(text)

        # Final chunk flush
        if current_chunk:
            combined = " ".join(current_chunk).strip()
            clauses.append({
                "heading": current_heading,
                "text": combined,
                "page": page_number
            })

        doc.close()
        if self.is_url and os.path.exists(pdf_path):
            os.remove(pdf_path)
        return clauses
# === DOCX Loader ===
class DOCXLoader(DocumentLoader):
    def extract(self):
        doc = Document(self.source)
        clauses = []
        for i, para in enumerate(doc.paragraphs):
            text = para.text.strip()
            if text:
                clauses.append({
                    "text": text,
                    "style": para.style.name,
                    "position": i + 1
                })
        return clauses
# === Email Loader (.eml files) ===
# class EmailLoader(DocumentLoader):
#     def extract(self):
#         mail = mailparser.parse_from_file(self.source)
#         return [{
#             "subject": mail.subject,
#             "from": mail.from_[0][1] if mail.from_ else "",
#             "to": mail.to[0][1] if mail.to else "",
#             "text": mail.body,
#             "date": str(mail.date)
#         }]
# === Main Wrapper Function ===
def load_document(source: str):
    parsed = urlparse(source)
    # Remote (http/https) sources are assumed to be PDFs; local files are routed by extension.
    if source.endswith(".pdf") or parsed.scheme.startswith("http"):
        loader = PDFLoader(source)
    elif source.endswith(".docx"):
        loader = DOCXLoader(source)
    # elif source.endswith(".eml"):
    #     loader = EmailLoader(source)
    else:
        raise ValueError("Unsupported file format or source type.")

    content = loader.extract()
    return {
        "source": source,
        "clauses": content
    }
# if __name__ == '__main__':
#     output = load_document('https://hackrx.blob.core.windows.net/assets/policy.pdf?sv=2023-01-03&st=2025-07-04T09%3A11%3A24Z&se=2027-07-05T09%3A11%3A00Z&sr=b&sp=r&sig=N4a9OU0w0QXO6AOIBiu4bpl7AXvEZogeT%2FjUHNO7HzQ%3D')
#     print("hello")
#     print(output['clauses'][4])
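
# Minimal usage sketch: pass a local .pdf/.docx path (or a PDF URL) on the command line
# to preview the extracted clauses.
if __name__ == '__main__':
    import sys

    if len(sys.argv) > 1:
        result = load_document(sys.argv[1])
        print(f"Source: {result['source']}")
        print(f"Extracted {len(result['clauses'])} clauses")
        for clause in result['clauses'][:3]:  # preview the first few entries
            print(clause)
    else:
        print(f"Usage: python {sys.argv[0]} <path-or-url>")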