Spaces:

JAYASREESS
/

final_year

Running

App Files Files Community

final_year / analysis /common_analyzer.py

jayasrees

first commit

9d21edd about 1 month ago

raw

history blame contribute delete

12.2 kB

	"""
	Strict Domain Analyzer for Legal Documents.
	Implements specific checks for:
	- Entity Roles (Vendor vs Vendee)
	- Domain Categories (Financial, Possession, Ownership, etc.)
	- Timeline Logic (Agreement vs Registration)
	- Numeric Consistency within context
	"""

	import re

	# =========================
	# 1. STRICT CLASSIFICATION
	# =========================

	def is_legal_boilerplate(text):
	    """Return True when the clause looks like a legal header, footer or witness block.

	    Two independent rules apply, both on the lower-cased text:
	    - any clause containing "signed by" or "witness" is boilerplate;
	    - a clause of fewer than five words is boilerplate when it contains
	      one of the known boilerplate phrases.
	    """
	    lowered = text.lower()

	    # Signature / attestation blocks qualify regardless of their length.
	    for marker in ("signed by", "witness"):
	        if marker in lowered:
	            return True

	    # The phrase list is only consulted for very short fragments.
	    if len(lowered.split()) >= 5:
	        return False

	    boilerplate_phrases = (
	        "in witness whereof", "signed and delivered", "witnesses:",
	        "schedule", "jurisdiction", "arbitration", "notice",
	        "all that piece and parcel", "north by", "south by",
	    )
	    for phrase in boilerplate_phrases:
	        if phrase in lowered:
	            return True
	    return False

	def get_clause_domain(text):
	    """
	    Assign a clause to the first legal domain whose rule matches.

	    Rules are tried in a fixed order on the lower-cased text, using plain
	    substring tests; the first hit wins.
	    Returns one of: 'RECITAL', 'DEFINITION', 'FINANCIAL', 'POSSESSION',
	    'OWNERSHIP', 'ENCUMBRANCE', 'ADMINISTRATIVE', 'OPERATIVE' or 'GENERAL'.
	    """
	    lowered = text.lower()

	    # Background recitals.
	    if lowered.startswith("whereas") or "and whereas" in lowered:
	        return "RECITAL"

	    # Definitions.
	    definition_markers = ("shall mean", "expression vendor", "expression vendee")
	    if any(marker in lowered for marker in definition_markers):
	        return "DEFINITION"

	    # Keyword-driven domains, in priority order.
	    keyword_domains = (
	        ("FINANCIAL", ("rs.", "rupees", "paid", "consideration", "sum of",
	                       "amount", "price", "cheque", "bank")),
	        ("POSSESSION", ("possession", "handed over", "delivered", "vacant")),
	        ("OWNERSHIP", ("owner", "title", "interest", "rights", "absolute",
	                       "fee simple")),
	        ("ENCUMBRANCE", ("encumbrance", "mortgage", "loan", "charge", "lien",
	                         "litigation")),
	        ("ADMINISTRATIVE", ("witness", "signed", "schedule", "jurisdiction",
	                            "arbitration", "notice")),
	    )
	    for domain, keywords in keyword_domains:
	        for keyword in keywords:
	            if keyword in lowered:
	                return domain

	    # Operative (action) clauses.
	    if lowered.startswith("that"):
	        return "OPERATIVE"
	    if "hereby" in lowered or "now this deed" in lowered:
	        return "OPERATIVE"

	    return "GENERAL"

	def get_entities(text):
	    """
	    Report which party roles are mentioned in a clause.

	    Returns a set containing "Vendor" and/or "Vendee" depending on whether
	    the corresponding lower-case word occurs anywhere in the text
	    (substring match, case-insensitive). The set is empty when neither
	    role is mentioned.
	    """
	    lowered = text.lower()
	    role_labels = (("vendor", "Vendor"), ("vendee", "Vendee"))
	    return {label for needle, label in role_labels if needle in lowered}

	# =========================
	# 2. EXTRACTION HELPERS
	# =========================

	def extract_numbers(text):
	    """Extract the integer values mentioned in a clause, in order of appearance.

	    Recognises:
	    - plain digit runs of any length ("5000", "550000");
	    - comma-grouped numbers in Western ("1,000,000") and Indian
	      ("1,00,000", "12,34,567") style, which must end in a 3-digit group.

	    Commas are stripped before conversion. Digits separated by other
	    punctuation are returned as separate integers (e.g. "1.25" -> [1, 25],
	    "1,2,3" -> [1, 2, 3]).

	    Returns:
	        list[int]: possibly empty list of the numbers found.
	    """
	    # The previous pattern (\d{1,3} followed only by ",ddd" groups) silently
	    # dropped un-grouped numbers with 4+ digits and split lakh-style grouping
	    # into unrelated pieces ("1,00,000" -> [1, 0]).
	    number_pattern = r"\b(?:\d{1,3}(?:,\d{2,3})*,\d{3}|\d+)\b"
	    return [int(token.replace(",", "")) for token in re.findall(number_pattern, text)]

	def has_negation(text):
	    """Return True when the clause contains a negation word.

	    The negation vocabulary is: "not", "never", "no", "cannot",
	    "must not", "shall not". Matching is case-insensitive and on whole
	    words only, so words that merely contain these letters ("notice",
	    "north", "nominee", "notwithstanding") do not count.

	    "No" directly followed by a number (optionally with "." or ":"), as in
	    "Door No. 12", is treated as the abbreviation for "number" and ignored.
	    """
	    neg_words = ("not", "never", "no", "cannot", "must not", "shall not")
	    lowered = text.lower()

	    # Blank out "no. <digit>" so the abbreviation is not read as a negation.
	    lowered = re.sub(r"\bno\s*[.:]?\s*\d", " ", lowered)

	    # Whole-word match; plain substring tests fired on almost any sentence.
	    pattern = r"\b(?:" + "|".join(re.escape(word) for word in neg_words) + r")\b"
	    return re.search(pattern, lowered) is not None

	def has_exception_language(text):
	    """Return True when the clause contains a legal exception or qualification phrase.

	    The check is a case-insensitive substring search for a fixed list of
	    qualifier phrases such as "subject to" or "notwithstanding".
	    """
	    lowered = text.lower()
	    qualifier_phrases = (
	        "subject to", "notwithstanding", "except as provided",
	        "unless otherwise", "provided however", "without prejudice",
	    )
	    for phrase in qualifier_phrases:
	        if phrase in lowered:
	            return True
	    return False

	def is_definition(text):
	    """Return True when the clause reads like a definition.

	    A clause counts as a definition when its lower-cased text contains
	    "shall mean", "means" or "defined as" (substring match).
	    """
	    lowered = text.lower()
	    definition_markers = ("shall mean", "means", "defined as")
	    return any(marker in lowered for marker in definition_markers)

	def is_party_intro(text):
	    """Detect whether a clause is just a description of a party (a "bio").

	    Three kinds of indicator are looked for in the lower-cased text:
	    - address wording ("door no", "d.no", "residing at", "post , village");
	    - a family relation as a whole word ("son", "wife", "s/o", "w/o", ...);
	    - an identity reference ("aadhaar", "pan no", "id card", "mobile no").

	    Returns:
	        bool: True when at least two of the three indicator kinds are present.
	    """
	    lowered = text.lower()

	    # NOTE: alternatives must be separated by a bare "|". The patterns
	    # previously used "\|", which matches a literal pipe character, so no
	    # indicator could ever match and the function always returned False.
	    address_pattern = r"(door\sno|d\.no|residing\sat|post\s,\svillage)"
	    relation_pattern = r"\b(son|wife|daughter|husband|father|mother|s/o|w/o|d/o)\b"
	    id_pattern = r"(aadhaar|pan\sno|id\scard|mobile\s*no)"

	    # One point per indicator kind that is present.
	    indicator_patterns = (address_pattern, relation_pattern, id_pattern)
	    score = sum(1 for pattern in indicator_patterns if re.search(pattern, lowered))

	    # Two or more strong components (e.g. Relation + ID) mark a party bio.
	    return score >= 2

	# =========================
	# 3. CORE LOGIC GATES
	# =========================

	def analyze_pair(text1, text2, similarity, threshold=0.75):
	    """
	    Run a pair of clauses through a fixed sequence of rule-based gates.

	    Each gate either rejects the pair (label None), classifies it, or lets
	    it fall through to the next gate. Gates run in this order: boilerplate,
	    domain mismatch, general-vs-specific, financial-vs-date,
	    eligibility-vs-assistance, party description, entity role, definition
	    guard, possession timeline, numeric reasoning, exception language,
	    negation, and finally the similarity threshold.

	    Args:
	        text1: First clause text.
	        text2: Second clause text.
	        similarity: Similarity score already computed for the pair; it is
	            compared against the fixed cut-offs 0.80, 0.85 and 0.9 as well
	            as against ``threshold``.
	        threshold: Minimum similarity score to consider as CANDIDATE
	            (default 0.75). The exception gate uses
	            ``max(0.65, threshold - 0.05)``.

	    Returns:
	        tuple: ``(label, score, reason)``. ``label`` is None when the pair is
	        skipped (score 0.0), otherwise one of "NUMERIC_INCONSISTENCY",
	        "QUALIFICATION", "LEGAL_CONFLICT" or "CANDIDATE".
	    """
	    t1_lower, t2_lower = text1.lower(), text2.lower()

	    # --- GATE 0: BOILERPLATE CHECK ---
	    if is_legal_boilerplate(text1) or is_legal_boilerplate(text2):
	        return None, 0.0, "Boilerplate (Skipped)"

	    # --- GATE 1: DOMAIN MISMATCH ---
	    d1 = get_clause_domain(text1)
	    d2 = get_clause_domain(text2)

	    # Two different specific domains are not compared unless the similarity
	    # is very high (which suggests one of them was misclassified).
	    if d1 != "GENERAL" and d2 != "GENERAL" and d1 != d2:
	        if similarity < 0.85:
	            return None, 0.0, "Domain Mismatch"

	    # --- HARDENED CHECK: GENERAL vs SPECIFIC ---
	    # Exactly one side is GENERAL: a common source of noise, so require a
	    # higher similarity before comparing.
	    if (d1 == "GENERAL") != (d2 == "GENERAL"):
	        if similarity < 0.80:
	            return None, 0.0, "General vs Specific Domain (Skipped)"

	    # --- SPECIFIC FILTER: MONEY vs TIMELINE ---
	    # A financial clause paired with anything containing a dd/mm/yyyy-style
	    # date is skipped unless a "schedule" is mentioned or similarity is high.
	    date_pattern = r"\d{1,2}[./-]\d{1,2}[./-]\d{2,4}"
	    is_financial = d1 == "FINANCIAL" or d2 == "FINANCIAL"
	    has_date = re.search(date_pattern, text1) or re.search(date_pattern, text2)

	    if is_financial and has_date:
	        if "schedule" not in t1_lower and "schedule" not in t2_lower:
	            if similarity < 0.85:
	                return None, 0.0, "Financial vs Timeline Mismatch"

	    # --- SPECIFIC FILTER: ELIGIBILITY vs ASSISTANCE ---
	    # Eligibility wording and assistance wording anywhere in the pair mark
	    # distinct sections unless the overlap is very precise.
	    eligibility_words = ["eligible", "qualify", "criteria", "requirement"]
	    assistance_words = ["provide", "grant", "subsidy", "support", "assistance"]
	    is_eligibility = any(w in t1_lower for w in eligibility_words) or \
	        any(w in t2_lower for w in eligibility_words)
	    is_assistance = any(w in t1_lower for w in assistance_words) or \
	        any(w in t2_lower for w in assistance_words)

	    if is_eligibility and is_assistance:
	        if similarity < 0.85:
	            return None, 0.0, "Eligibility vs Assistance Mismatch"

	    # --- GATE 1.5: PARTY DESCRIPTION CHECK ---
	    # Both clauses are just descriptions of people (addresses, relations).
	    if is_party_intro(text1) and is_party_intro(text2):
	        return None, 0.0, "Party Description (Skipped)"

	    # --- GATE 2: ENTITY MISMATCH ---
	    e1 = get_entities(text1)
	    e2 = get_entities(text2)
	    # One side is Vendor ONLY and the other Vendee ONLY (no shared role).
	    if e1 and e2 and e1.isdisjoint(e2):
	        if similarity < 0.85:
	            return None, 0.0, "Entity Role Mismatch"

	    # --- GATE 2.5: DEFINITION GUARD ---
	    # A definition is only compared with another definition.
	    if is_definition(text1) != is_definition(text2):
	        return None, 0.0, "Definition vs Operative"

	    # --- GATE 3: POSSESSION TIMELINE ---
	    # "Possession at agreement" vs "Possession at registration" is a
	    # sequence, NOT a contradiction.
	    if d1 == "POSSESSION" and d2 == "POSSESSION":
	        early_stage = ["agreement", "earnest"]
	        late_stage = ["registration", "sale deed", "final"]

	        early1 = any(k in t1_lower for k in early_stage)
	        early2 = any(k in t2_lower for k in early_stage)
	        late1 = any(k in t1_lower for k in late_stage)
	        late2 = any(k in t2_lower for k in late_stage)

	        # Check both orderings. Previously both halves of the condition
	        # tested "text1 early and text2 late", so the mirrored case
	        # (text1 late, text2 early) was never recognised.
	        if (early1 and late2) or (early2 and late1):
	            return None, 0.0, "Possession Timeline Sequence"

	    # --- GATE 4: NUMERIC REASONING ---
	    nums1 = extract_numbers(text1)
	    nums2 = extract_numbers(text2)

	    if nums1 and nums2 and nums1 != nums2:
	        # MAGNITUDE CHECK: largest values differing by more than 100x are
	        # likely different units (e.g. Price vs Area).
	        max1, max2 = max(nums1), max(nums2)
	        if max1 > 0 and max2 > 0:
	            ratio = max(max1, max2) / min(max1, max2)
	            if ratio > 100:
	                return None, 0.0, "Numeric Magnitude Mismatch (Likely Unit Diff)"

	        # Same specific domain: treat differing numbers as an inconsistency.
	        if d1 == d2 and d1 != "GENERAL":
	            return "NUMERIC_INCONSISTENCY", 0.9, f"Mismatch in {d1} values"

	        # Otherwise only flag when the clauses are nearly identical.
	        if similarity > 0.9:
	            return "NUMERIC_INCONSISTENCY", 0.85, "Numeric Mismatch in similar context"

	    # --- GATE 4.5: EXCEPTION/HIERARCHY CHECK ---
	    # Slightly lower threshold than the candidate gate, floored at 0.65.
	    exception_threshold = max(0.65, threshold - 0.05)
	    if similarity > exception_threshold:
	        # Exactly one clause carries exception language -> qualification.
	        if has_exception_language(text1) != has_exception_language(text2):
	            return "QUALIFICATION", similarity, "Legal Exception/Qualification detected (Not a Conflict)"

	    # --- GATE 5: LOGICAL NEGATION ---
	    # Exactly one clause is negated; only flag when similarity is high
	    # enough to imply both clauses talk about the same thing.
	    if has_negation(text1) != has_negation(text2):
	        if similarity > 0.85:
	            return "LEGAL_CONFLICT", 0.8, "Logical Negation detected"

	    # --- FINAL GATE: CANDIDATE FOR NLI ---
	    # All blocking gates passed; let NLI decide when similarity is high.
	    if similarity > threshold:
	        return "CANDIDATE", similarity, "High Similarity - Pending NLI"

	    return None, 0.0, "Low Similarity"