|
|
import re |
|
|
from typing import List |
|
|
|
|
|
|
|
|
def segment_into_clauses(full_text: str, min_length: int = 50) -> List[str]:
    """Split a document's text into individual clauses.

    Clause boundaries are detected with regular expressions at the start of
    the text or the start of a line. Recognised markers are "1.", "1.1",
    "(a)", roman numerals ("iv.", "IV."), and "Article N:", "Section N:",
    "Clause N:" headings. The matched marker itself is removed from the
    returned clauses.

    If marker-based splitting yields at most one clause, the text is instead
    split on blank lines and on line breaks that follow a full stop and
    precede a capital letter.

    Args:
        full_text: The complete document text.
        min_length: Fragments whose stripped length is not greater than this
            value are discarded. Defaults to 50.

    Returns:
        The stripped clause texts in document order. The list is empty when
        no fragment is long enough.
    """
    if not full_text:
        return []

    # A marker must open a line. (?:\A|\n) also accepts the very first line,
    # so a document that begins with "1. ..." has its first marker removed
    # just like every later clause.
    line_start = r'(?:\A|\n)\s*'
    markers = [
        r'\d+\.\s+',            # 1.
        r'\d+\.\d+\s+',         # 1.1
        r'\(\w+\)\s+',          # (a), (1), (ii)
        r'[ivx]+\.\s+',         # iv.
        r'[IVX]+\.\s+',         # IV.
        r'Article\s+\d+\s*:',   # Article 3:
        r'Section\s+\d+\s*:',   # Section 3:
        r'Clause\s+\d+\s*:',    # Clause 3:
    ]
    marker_pattern = line_start + '(?:' + '|'.join(markers) + ')'

    stripped = (part.strip() for part in re.split(marker_pattern, full_text))
    clauses = [part for part in stripped if len(part) > min_length]

    if len(clauses) <= 1:
        # Fallback: blank lines, or a line break between "." and a capital
        # letter. The lookbehind/lookahead keep the period and the capital
        # letter in the output; only the whitespace between them is consumed.
        fallback_pattern = r'\n\s*\n|(?<=\.)\s*\n\s*(?=[A-Z])'
        stripped = (
            part.strip() for part in re.split(fallback_pattern, full_text)
        )
        clauses = [part for part in stripped if len(part) > min_length]

    return clauses
|
|
|