File size: 2,615 Bytes
8397f09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import re
from typing import Dict, List, Any

class EnhancedLegalProcessor:
    """Regex-based extractor for structural elements of document text.

    Every pattern is compiled once in ``__init__``. The instance stores no
    per-document state, so one processor can be reused for many documents.
    """

    def __init__(self):
        # Two or more consecutive lines that each start and end with a pipe.
        self.table_pattern = re.compile(r'(\|\s*[^\n]+\s*\|(?:\n\|\s*[^\n]+\s*\|)+)')
        # Consecutive lines that start with "1.", "*" or "-". Anchoring with
        # ``^`` under MULTILINE (rather than consuming the preceding "\n")
        # keeps that line break out of the matched text.
        self.list_pattern = re.compile(
            r'^(?:\d+\.|\*|\-)\s+[^\n]+(?:\n(?:\d+\.|\*|\-)\s+[^\n]+)*',
            re.MULTILINE,
        )
        # Any text enclosed by a pair of dollar signs.
        # NOTE(review): two currency amounts such as "$100 and $200" also
        # match (as "$100 and $") -- confirm whether that is acceptable.
        self.formula_pattern = re.compile(r'\$[^$]+\$')
        # Runs of words made of two or more capital letters, e.g. "USA".
        self.abbreviation_pattern = re.compile(r'\b[A-Z]{2,}(?:\s+[A-Z]{2,})*\b')
        # "X means Y" / "X shall mean Y"; Y runs up to the next period.
        # Compiled here, like the other patterns, instead of on every call.
        self.definition_pattern = re.compile(
            r'([A-Z][A-Za-z\s]+)(?:\s+means|\s+shall\s+mean)\s+([^\.]+)'
        )

    def process_document(self, text: str) -> Dict[str, Any]:
        """Run every extractor over *text* and bundle the results.

        Args:
            text: Raw document text.

        Returns:
            A dict with the keys ``"tables"``, ``"lists"``, ``"formulas"``,
            ``"abbreviations"`` (lists of matched strings), ``"definitions"``
            (term -> definition) and ``"cleaned_text"`` (normalized text).
        """
        return {
            "tables": self._extract_tables(text),
            "lists": self._extract_lists(text),
            "formulas": self._extract_formulas(text),
            "abbreviations": self._extract_abbreviations(text),
            "definitions": self._extract_definitions(text),
            "cleaned_text": self._clean_text(text),
        }

    def _extract_tables(self, text: str) -> List[str]:
        """Return every pipe-delimited table (two or more rows) in *text*."""
        return self.table_pattern.findall(text)

    def _extract_lists(self, text: str) -> List[str]:
        """Return every numbered or bulleted list in *text*.

        Each entry is the full text of one run of consecutive list lines,
        without the line break that precedes the first item.
        """
        return self.list_pattern.findall(text)

    def _extract_formulas(self, text: str) -> List[str]:
        """Return every ``$...$`` span in *text*, delimiters included."""
        return self.formula_pattern.findall(text)

    def _extract_abbreviations(self, text: str) -> List[str]:
        """Return all-caps abbreviations in order of appearance.

        Repeated abbreviations are returned once per occurrence.
        """
        return self.abbreviation_pattern.findall(text)

    def _extract_definitions(self, text: str) -> Dict[str, str]:
        """Map defined terms to their definitions.

        Recognizes "X means Y" and "X shall mean Y", where Y extends to the
        next period. If a term is defined more than once, the last
        definition wins.
        """
        return {
            match.group(1).strip(): match.group(2).strip()
            for match in self.definition_pattern.finditer(text)
        }

    def _clean_text(self, text: str) -> str:
        """Normalize whitespace in *text* while keeping its line structure.

        Runs of spaces, tabs and other non-newline whitespace become one
        space; blank lines and spaces next to a line break are removed;
        leading and trailing whitespace is stripped.
        """
        # Collapse horizontal whitespace only. Matching "\s+" here would also
        # swallow newlines and leave nothing for the next step to collapse.
        text = re.sub(r'[^\S\n]+', ' ', text)
        # Collapse runs of newlines, together with the single spaces left
        # around them by the previous step, into one newline.
        text = re.sub(r' ?\n[ \n]*', '\n', text)
        return text.strip()

# Shared module-level instance, created once when this module is imported,
# so callers can reuse it instead of constructing their own processor.
enhanced_legal_processor = EnhancedLegalProcessor()