Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
| import re | |
| class LegalDocProcessor: | |
| def __init__(self, parent_path, child_path): | |
| self.parent_path = parent_path | |
| self.child_path = child_path | |
| # RECOMENDED: Simplify these to base keywords for maximum "looseness" | |
| self.allowed_sources = [ | |
| "Constitution", | |
| "Criminal Code", | |
| "Civil Code", | |
| "Electronic Transactions", | |
| "Domestic Violence", | |
| "Human Trafficking", | |
| "Motor Vehicles", | |
| "Labor Act", | |
| "Income Tax", | |
| "Banking", | |
| "Consumer Protection", | |
| "Environment", | |
| "Citizenship", | |
| "Witchcraft", | |
| "Acid", | |
| "Muluki Ain", | |
| "Land Act", | |
| "Public Health", | |
| "Copyright Act", | |
| "Education Act", | |
| "Public Health", | |
| "Banks", | |
| "Companies Act", | |
| "Muluki Civil", | |
| "Children's Act", | |
| "National Women Commission", | |
| "Public", | |
| "Discrimination", | |
| "Social", | |
| "Motherhood", | |
| "Sexual Harassment", | |
| "Sexual Harassment at the Workplace (Elimination) Act, 2015" | |
| ] | |
| def _get_base_clause(self, clause_id): | |
| if not clause_id: return None | |
| match = re.match(r"([0-9A-Za-z]+)", str(clause_id)) | |
| return match.group(1) if match else str(clause_id) | |
| # NEW HELPER: Reusable loose check | |
| def _is_source_allowed(self, src_name): | |
| if not src_name: return False | |
| src_lower = str(src_name).lower() | |
| return any(allowed.lower() in src_lower for allowed in self.allowed_sources) | |
| def load_and_clean(self): | |
| parent_lookup = {} | |
| processed_docs = [] | |
| # 1. PROCESS PARENTS (Now with loose matching) | |
| if os.path.exists(self.parent_path): | |
| with open(self.parent_path, 'r', encoding='utf-8') as f: | |
| parents = json.load(f) | |
| for p in parents: | |
| src = p.get('legal_document_source', "").strip() | |
| # LOOSE CHECK APPLIED HERE | |
| if self._is_source_allowed(src): | |
| cid = str(p.get('clause_id')).strip().lower() | |
| # Use (src, cid) to match exactly how children identify parents | |
| parent_lookup[(src, cid)] = p.get('text') | |
| # 2. PROCESS CHILDREN | |
| if os.path.exists(self.child_path): | |
| with open(self.child_path, 'r', encoding='utf-8') as f: | |
| children = json.load(f) | |
| for child in children: | |
| src = child.get('legal_document_source', "").strip() | |
| # LOOSE CHECK APPLIED HERE | |
| if not self._is_source_allowed(src): | |
| continue | |
| raw_id = str(child.get('clause_id')).strip().lower() | |
| raw_p_id = str(child.get('parent_clause_id') or child.get('clause_id')).strip().lower() | |
| base_p_id = self._get_base_clause(raw_p_id).lower() | |
| # Try to find parent using the exact source name found in this chunk | |
| p_text = parent_lookup.get((src, raw_p_id)) or \ | |
| parent_lookup.get((src, base_p_id), "Parent context not found.") | |
| processed_docs.append({ | |
| "search_content": child.get('text', ""), | |
| "metadata": { | |
| "clause_id": raw_id, | |
| "text": child.get('text'), | |
| "legal_document_source": src, | |
| "parent_clause_id": base_p_id, | |
| "parent_clause_text": p_text, | |
| "chapter": child.get('chapter', ""), | |
| "part": child.get('part', "") | |
| } | |
| }) | |
| return processed_docs |