Spaces:
Sleeping
Sleeping
| import re | |
| from typing import List | |
def segment_into_clauses(full_text: str, *, min_length: int = 50) -> List[str]:
    """Segment document text into individual clauses.

    The text is first split at line-leading clause markers such as "1.",
    "1.1", "(a)", "ii.", "IV.", "Article 1:", "Section 2:" or "Clause 3:".
    The markers themselves are removed from the output. If that produces at
    most one substantial clause, the text is instead split on blank lines and
    on line breaks that follow a period and precede a capital letter.

    Args:
        full_text: The complete document text.
        min_length: Only pieces whose stripped length is strictly greater
            than this value are returned. Defaults to 50.

    Returns:
        The stripped clauses, in document order. Empty for empty input.
    """
    if not full_text:
        return []

    # Marker bodies only; the shared line-start prefix is added once below.
    marker_patterns = [
        r'\d+\.\s+',            # "1. ", "2. ", etc.
        r'\d+\.\d+\s+',         # "1.1 ", "1.2 ", etc.
        r'\(\w+\)\s+',          # "(a) ", "(b) ", etc.
        r'[ivx]+\.\s+',         # "i. ", "ii. ", "iii. ", etc.
        r'[IVX]+\.\s+',         # "I. ", "II. ", "III. ", etc.
        r'Article\s+\d+\s*:',   # "Article 1:", "Article 2:", etc.
        r'Section\s+\d+\s*:',   # "Section 1:", "Section 2:", etc.
        r'Clause\s+\d+\s*:',    # "Clause 1:", "Clause 2:", etc.
    ]
    # `(?:^|\n)` lets a marker on the very first line be recognised too, so
    # the first clause is treated the same way as every later one.
    marker_regex = r'(?:^|\n)\s*(?:' + '|'.join(marker_patterns) + r')'

    def _substantial(pieces):
        """Strip each piece and keep those longer than ``min_length``."""
        stripped = (piece.strip() for piece in pieces)
        return [piece for piece in stripped if len(piece) > min_length]

    clauses = _substantial(re.split(marker_regex, full_text))

    # If no clauses were found with the markers, try a simpler approach.
    if len(clauses) <= 1:
        # Split on blank lines, or on a line break between a period and a
        # capital letter. The lookbehind/lookahead keep the period and the
        # capital letter in the output instead of consuming them.
        fallback_regex = r'\n\s*\n|(?<=\.)\s*\n\s*(?=[A-Z])'
        clauses = _substantial(re.split(fallback_regex, full_text))

    return clauses