MajorProjectRAG / src /data_processor.py
Prasanga73's picture
Upload 3 files
67dbff4 verified
import json
import os
import re
class LegalDocProcessor:
    """Pair child clause chunks with the text of their parent clause.

    Reads two JSON files (a list of parent records and a list of child
    records), keeps only records whose ``legal_document_source`` contains one
    of ``allowed_sources`` (case-insensitive substring match), and returns one
    flat document per surviving child with its parent's text attached.
    """

    # Placeholder stored when no usable parent text can be located.
    _MISSING_PARENT_TEXT = "Parent context not found."

    def __init__(self, parent_path, child_path):
        """Store the input locations and the source allow-list.

        Args:
            parent_path: Path of the JSON file holding the parent records.
            child_path: Path of the JSON file holding the child records.
        """
        self.parent_path = parent_path
        self.child_path = child_path
        # RECOMMENDED: keep these as short base keywords for maximum
        # "looseness". Matching is a substring test, so a broad keyword such
        # as "Public" already covers longer names that contain it.
        self.allowed_sources = [
            "Constitution",
            "Criminal Code",
            "Civil Code",
            "Electronic Transactions",
            "Domestic Violence",
            "Human Trafficking",
            "Motor Vehicles",
            "Labor Act",
            "Income Tax",
            "Banking",
            "Consumer Protection",
            "Environment",
            "Citizenship",
            "Witchcraft",
            "Acid",
            "Muluki Ain",
            "Land Act",
            "Public Health",
            "Copyright Act",
            "Education Act",
            "Banks",
            "Companies Act",
            "Muluki Civil",
            "Children's Act",
            "National Women Commission",
            "Public",
            "Discrimination",
            "Social",
            "Motherhood",
            "Sexual Harassment",
            "Sexual Harassment at the Workplace (Elimination) Act, 2015",
        ]

    def _get_base_clause(self, clause_id):
        """Return the leading alphanumeric run of ``clause_id``.

        Returns ``None`` for a falsy ``clause_id``. When the id does not start
        with an ASCII letter or digit, the id itself is returned as a string.
        """
        if not clause_id:
            return None
        text = str(clause_id)
        match = re.match(r"([0-9A-Za-z]+)", text)
        return match.group(1) if match else text

    def _is_source_allowed(self, src_name):
        """Return True when any allowed keyword occurs inside ``src_name``.

        The comparison is case-insensitive; a falsy ``src_name`` is rejected.
        """
        if not src_name:
            return False
        src_lower = str(src_name).lower()
        return any(
            allowed.lower() in src_lower for allowed in self.allowed_sources
        )

    @staticmethod
    def _normalise_id(value):
        """Return ``value`` as a stripped, lower-cased string ('' for None)."""
        # str(None) would yield the literal "none", which then looks like a
        # real clause id and can match unrelated records.
        if value is None:
            return ""
        return str(value).strip().lower()

    @staticmethod
    def _clean_source(record):
        """Return the record's stripped source name ('' when missing/null)."""
        # `or ""` also covers an explicit JSON null, which a .get() default
        # alone does not.
        return str(record.get("legal_document_source") or "").strip()

    @staticmethod
    def _load_records(path):
        """Return the parsed JSON content of ``path``, or [] if it is absent."""
        if not os.path.exists(path):
            return []
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def load_and_clean(self):
        """Build the flattened child documents.

        A missing parent or child file is treated as an empty list. Records
        whose source is not allowed are dropped. For each remaining child the
        parent text is looked up first by the full parent clause id and then
        by its base id (see ``_get_base_clause``), always within the same
        source name.

        Returns:
            A list of dicts, each with a ``search_content`` string and a
            ``metadata`` dict (clause id, text, source, parent clause id and
            text, chapter, part).
        """
        # 1. PROCESS PARENTS: index parent text by (source, clause id).
        parent_lookup = {}
        for parent in self._load_records(self.parent_path):
            src = self._clean_source(parent)
            if not self._is_source_allowed(src):
                continue
            cid = self._normalise_id(parent.get("clause_id"))
            if not cid:
                # A parent without an id can never be referenced by a child.
                continue
            # Use (src, cid) to match exactly how children identify parents.
            parent_lookup[(src, cid)] = parent.get("text")

        # 2. PROCESS CHILDREN
        processed_docs = []
        for child in self._load_records(self.child_path):
            src = self._clean_source(child)
            if not self._is_source_allowed(src):
                continue

            raw_id = self._normalise_id(child.get("clause_id"))
            # A child without a parent id is looked up under its own id.
            parent_ref = child.get("parent_clause_id") or child.get("clause_id")
            raw_p_id = self._normalise_id(parent_ref) or raw_id
            # _get_base_clause returns None for an empty id; keep a string.
            base_p_id = (self._get_base_clause(raw_p_id) or "").lower()

            # Try the exact parent id first, then its base id; an absent or
            # empty parent text falls through to the placeholder.
            p_text = (
                parent_lookup.get((src, raw_p_id))
                or parent_lookup.get((src, base_p_id))
                or self._MISSING_PARENT_TEXT
            )

            processed_docs.append({
                "search_content": child.get("text", ""),
                "metadata": {
                    "clause_id": raw_id,
                    "text": child.get("text"),
                    "legal_document_source": src,
                    "parent_clause_id": base_p_id,
                    "parent_clause_text": p_text,
                    "chapter": child.get("chapter", ""),
                    "part": child.get("part", ""),
                },
            })
        return processed_docs