Spaces:
Runtime error
Runtime error
File size: 2,615 Bytes
8397f09 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import re
from typing import Dict, List, Any
class EnhancedLegalProcessor:
    """Regex-based extractor for structural elements of legal document text.

    Finds pipe-delimited tables, numbered/bulleted lists, ``$...$`` formulas,
    upper-case abbreviations and "X means Y" / "X shall mean Y" definitions,
    and produces a whitespace-normalised copy of the input text.
    """

    def __init__(self):
        # A "|...|" row followed by one or more further rows, each of which
        # starts with "|" directly after a newline.
        self.table_pattern = re.compile(r'(\|\s*[^\n]+\s*\|(?:\n\|\s*[^\n]+\s*\|)+)')
        # One or more consecutive items introduced by "1.", "*" or "-".  The
        # first item must sit at the start of the text or right after a
        # newline; that newline is part of the match.
        self.list_pattern = re.compile(r'(?:^|\n)(?:\d+\.|\*|\-)\s+[^\n]+(?:\n(?:\d+\.|\*|\-)\s+[^\n]+)*')
        # Text enclosed by a pair of "$" with no "$" in between.
        self.formula_pattern = re.compile(r'\$[^$]+\$')
        # Words of two or more capitals (A-Z); whitespace-separated runs of
        # such words are merged into a single match.
        self.abbreviation_pattern = re.compile(r'\b[A-Z]{2,}(?:\s+[A-Z]{2,})*\b')
        # "<Term> means <definition>" or "<Term> shall mean <definition>".
        # The term starts with a capital letter; the definition runs up to
        # (not including) the next period.  Compiled once here, like the
        # other patterns, instead of on every extraction call.
        self.definition_pattern = re.compile(
            r'([A-Z][A-Za-z\s]+)(?:\s+means|\s+shall\s+mean)\s+([^\.]+)'
        )

    def process_document(self, text: str) -> Dict[str, Any]:
        """Process a legal document and extract various elements.

        Args:
            text: Raw document text.

        Returns:
            A dict with the keys ``"tables"``, ``"lists"``, ``"formulas"``,
            ``"abbreviations"`` (lists of matched strings), ``"definitions"``
            (term -> definition mapping) and ``"cleaned_text"`` (the
            whitespace-normalised text).
        """
        return {
            "tables": self._extract_tables(text),
            "lists": self._extract_lists(text),
            "formulas": self._extract_formulas(text),
            "abbreviations": self._extract_abbreviations(text),
            "definitions": self._extract_definitions(text),
            "cleaned_text": self._clean_text(text),
        }

    def _extract_tables(self, text: str) -> List[str]:
        """Return every multi-row pipe-delimited table found in ``text``."""
        # The pattern has a single capturing group spanning the whole table,
        # so findall() yields the full table text for each match.
        return self.table_pattern.findall(text)

    def _extract_lists(self, text: str) -> List[str]:
        """Return every numbered or bulleted list found in ``text``.

        Each entry is the full list (items joined by newlines) without the
        newline that precedes the list in the source text.
        """
        # The pattern consumes the newline in front of a list that is not at
        # the very start of the text; drop it so all entries have the same
        # shape regardless of where the list appeared.
        return [
            match.group(0).lstrip("\n")
            for match in self.list_pattern.finditer(text)
        ]

    def _extract_formulas(self, text: str) -> List[str]:
        """Return every ``$...$`` delimited formula, delimiters included."""
        return self.formula_pattern.findall(text)

    def _extract_abbreviations(self, text: str) -> List[str]:
        """Return every upper-case abbreviation (or run of them) in ``text``."""
        return self.abbreviation_pattern.findall(text)

    def _extract_definitions(self, text: str) -> Dict[str, str]:
        """Return a mapping of defined terms to their definitions.

        Recognises "X means Y" and "X shall mean Y".  Both the term and the
        definition are stripped of surrounding whitespace.  If a term is
        defined more than once, the last definition wins.
        """
        definitions = {}
        for match in self.definition_pattern.finditer(text):
            term = match.group(1).strip()
            definitions[term] = match.group(2).strip()
        return definitions

    def _clean_text(self, text: str) -> str:
        """Normalise whitespace in ``text`` while keeping line breaks.

        Runs of non-newline whitespace become a single space, runs of line
        breaks (together with the spaces around them) become a single
        newline, and leading/trailing whitespace is removed.
        """
        # Collapse horizontal whitespace only.  Using r'\s+' here would also
        # turn every newline into a space and make the newline step below a
        # no-op.
        text = re.sub(r'[^\S\n]+', ' ', text)
        # Collapse each run of newlines, plus any spaces around or between
        # them, into one newline.
        text = re.sub(r' ?\n[ \n]*', '\n', text)
        return text.strip()
# Shared module-level instance, created once when this module is imported.
enhanced_legal_processor = EnhancedLegalProcessor()