| """ | |
| Precedent Chain Builder — Runtime Module. | |
| Loads citation graph built offline by preprocessing/build_citation_graph.py. | |
| At query time, enriches retrieved chunks with cited predecessor judgments. | |
| WHY: | |
| Indian SC judgments build on each other. A 1984 judgment establishing | |
| a key principle was itself built on a 1971 judgment. Showing the user | |
| the reasoning chain across cases makes NyayaSetu feel like a legal | |
| researcher, not a search engine. | |
| The graph is loaded once at startup and kept in memory. | |
| Lookup is O(1) dict access — negligible runtime cost. | |
| """ | |
| import os | |
| import json | |
| import re | |
| import logging | |
| from typing import List, Dict, Optional | |
| logger = logging.getLogger(__name__) | |
# ── Graph store ─────────────────────────────────────────────
# Module-level caches, populated once by load_citation_graph() at startup
# and read-only afterwards. Lookup is plain dict access — O(1).
_graph: Dict[str, List[str]] = {}  # judgment_id -> [citation_strings]
_reverse_graph: Dict[str, List[str]] = {}  # citation_string -> [judgment_ids]
_title_to_id: Dict[str, str] = {}  # normalised_title -> judgment_id
_parent_store: Dict[str, str] = {}  # judgment_id -> text (loaded from parent_judgments.jsonl)
_loaded: bool = False  # True once load_citation_graph() has run without a hard failure
def load_citation_graph(
    graph_path: str = "data/citation_graph.json",
    reverse_path: str = "data/reverse_citation_graph.json",
    title_path: str = "data/title_to_id.json",
    parent_path: str = "data/parent_judgments.jsonl"
):
    """
    Load all citation graph artifacts once at startup.

    Call from api/main.py after download_models(). Fails gracefully:
    each missing file is logged and skipped, and any hard failure only
    disables the precedent-chain feature instead of crashing startup.

    Args:
        graph_path: JSON mapping judgment_id -> list of citation strings.
        reverse_path: JSON mapping citation string -> list of judgment_ids.
        title_path: JSON mapping normalised title -> judgment_id.
        parent_path: JSONL file, one object per line with "judgment_id"
            and "text" fields.
    """
    global _graph, _reverse_graph, _title_to_id, _parent_store, _loaded
    try:
        if os.path.exists(graph_path):
            # Explicit encoding: judgment text/titles are not ASCII-only.
            with open(graph_path, encoding="utf-8") as f:
                _graph = json.load(f)
            logger.info("Citation graph loaded: %d judgments", len(_graph))
        else:
            logger.warning("Citation graph not found at %s", graph_path)
        if os.path.exists(reverse_path):
            with open(reverse_path, encoding="utf-8") as f:
                _reverse_graph = json.load(f)
            logger.info("Reverse citation graph loaded: %d citations", len(_reverse_graph))
        if os.path.exists(title_path):
            with open(title_path, encoding="utf-8") as f:
                _title_to_id = json.load(f)
            logger.info("Title index loaded: %d titles", len(_title_to_id))
        # Load parent judgments for text retrieval
        if os.path.exists(parent_path):
            store: Dict[str, str] = {}
            with open(parent_path, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        j = json.loads(line)
                    except json.JSONDecodeError:
                        # Best-effort: skip malformed lines, keep loading.
                        continue
                    if not isinstance(j, dict):
                        # Valid JSON but not an object — nothing to index.
                        continue
                    jid = j.get("judgment_id", "")
                    if jid:
                        store[jid] = j.get("text", "")
            # Rebind rather than mutate so a reload drops stale entries.
            _parent_store = store
            logger.info("Parent store loaded: %d judgments", len(_parent_store))
        _loaded = True
    except Exception as e:
        logger.error("Citation graph load failed: %s. Precedent chain disabled.", e)
        _loaded = False
def _resolve_citation_to_judgment(citation_string: str) -> Optional[str]:
    """
    Try to match a citation string to a judgment_id.

    Uses multiple strategies in order of reliability:
      1. Exact hit in the reverse citation graph.
      2. Exact hit on the normalised title index.
      3. Prefix match against the title index.

    Returns:
        The first matching judgment_id, or None if nothing matches.
    """
    if not citation_string:
        return None
    # Strategy 1: Check reverse graph directly (single lookup via .get).
    refs = _reverse_graph.get(citation_string)
    if refs:
        return refs[0]
    # Strategy 2: Normalise (lowercase, strip punctuation, cap at 50 chars)
    # and check title index.
    normalised = re.sub(r'[^\w\s]', '', citation_string.lower())[:50]
    if normalised in _title_to_id:
        return _title_to_id[normalised]
    # Strategy 3: Partial match on title index. The length guard and the
    # prefix slice are loop-invariant, so hoist them out of the scan;
    # very short strings are skipped entirely (too many false matches).
    if len(normalised) > 10:
        prefix = normalised[:20]
        for title, jid in _title_to_id.items():
            if prefix in title:
                return jid
    return None
def get_precedent_chain(
    judgment_ids: List[str],
    max_precedents: int = 3
) -> List[Dict]:
    """
    Given a list of retrieved judgment IDs, return their cited predecessors.

    Args:
        judgment_ids: IDs of judgments already retrieved by FAISS
        max_precedents: maximum number of precedent chunks to return

    Returns:
        List of precedent dicts with same structure as regular chunks,
        plus 'is_precedent': True and 'cited_by' field. Empty if the
        graph is not loaded or max_precedents is not positive.
    """
    # Early exit on non-positive budget: the old post-append break check
    # would otherwise still return one precedent for max_precedents=0.
    if not _loaded or not _graph or max_precedents <= 0:
        return []
    precedents: List[Dict] = []
    # Seed with the retrieved IDs so we never re-add a judgment the user
    # already has, and dedupe precedents across the input list.
    seen_ids = set(judgment_ids)
    for jid in judgment_ids:
        for citation_ref in _graph.get(jid, [])[:3]:  # max 3 citations per judgment
            resolved_id = _resolve_citation_to_judgment(citation_ref)
            if not resolved_id or resolved_id in seen_ids:
                continue
            # Get text from parent store; skip citations we cannot show.
            text = _parent_store.get(resolved_id, "")
            if not text:
                continue
            seen_ids.add(resolved_id)
            # Extract a useful excerpt — first 1500 chars after any header
            excerpt = text[:1500].strip()
            precedents.append({
                "judgment_id": resolved_id,
                "chunk_id": f"{resolved_id}_precedent",
                "text": excerpt,
                "title": f"Precedent: {citation_ref[:80]}",
                # ID convention appears to be <name>_<year>_... — TODO confirm
                "year": resolved_id.split("_")[1] if "_" in resolved_id else "",
                "source_type": "case_law",
                "is_precedent": True,
                "cited_by": jid,
                "citation_ref": citation_ref,
                "similarity_score": 0.5  # precedents are added, not ranked
            })
            if len(precedents) >= max_precedents:
                break
        if len(precedents) >= max_precedents:
            break
    if precedents:
        logger.info("Precedent chain: added %d predecessor judgments", len(precedents))
    return precedents
def get_citation_count(judgment_id: str) -> int:
    """Count how many judgments in the graph cite the given judgment.

    Resolves every citation edge, so this is O(edges) per call —
    fine for occasional use, not a per-query hot path.
    """
    return sum(
        1
        for cited_refs in _graph.values()
        for ref in cited_refs
        if _resolve_citation_to_judgment(ref) == judgment_id
    )
def is_loaded() -> bool:
    """Report whether the citation-graph artifacts were loaded at startup."""
    return _loaded