| """ | |
| Precedent Chain Builder — Runtime Module. | |
| Loads citation graph built offline by preprocessing/build_citation_graph.py. | |
| At query time, enriches retrieved chunks with cited predecessor judgments. | |
| WHY: | |
| Indian SC judgments build on each other. A 1984 judgment establishing | |
| a key principle was itself built on a 1971 judgment. Showing the user | |
| the reasoning chain across cases makes NyayaSetu feel like a legal | |
| researcher, not a search engine. | |
| The graph is loaded once at startup and kept in memory. | |
| Lookup is O(1) dict access — negligible runtime cost. | |
| """ | |
| import os | |
| import json | |
| import re | |
| import logging | |
| from typing import List, Dict, Optional | |
| logger = logging.getLogger(__name__) | |
# ── Graph store ─────────────────────────────────────────────
# Module-level caches, populated once by load_citation_graph() at startup
# and read-only afterwards. Lookup is plain dict access — O(1).
_graph: Dict[str, List[str]] = {}  # judgment_id -> [citation_strings]
_reverse_graph: Dict[str, List[str]] = {}  # citation_string -> [judgment_ids]
_title_to_id: Dict[str, str] = {}  # normalised_title -> judgment_id
_parent_store: Dict[str, str] = {}  # judgment_id -> text (loaded from parent_judgments.jsonl)
_loaded: bool = False  # True once load_citation_graph() has run without a hard failure
def load_citation_graph(
    graph_path: str = "data/citation_graph.json",
    reverse_path: str = "data/reverse_citation_graph.json",
    title_path: str = "data/title_to_id.json",
    parent_path: str = "data/parent_judgments.jsonl"
):
    """
    Load all citation graph artifacts once at startup.

    Call from api/main.py after download_models(). Fails gracefully:
    each missing file is logged and skipped, and any hard failure only
    disables the precedent-chain feature instead of crashing startup.

    Args:
        graph_path: JSON mapping judgment_id -> list of citation strings.
        reverse_path: JSON mapping citation string -> list of judgment_ids.
        title_path: JSON mapping normalised title -> judgment_id.
        parent_path: JSONL file, one object per line with "judgment_id"
            and "text" fields.
    """
    global _graph, _reverse_graph, _title_to_id, _parent_store, _loaded
    try:
        if os.path.exists(graph_path):
            # Explicit encoding: judgment text/titles are not ASCII-only.
            with open(graph_path, encoding="utf-8") as f:
                _graph = json.load(f)
            logger.info("Citation graph loaded: %d judgments", len(_graph))
        else:
            logger.warning("Citation graph not found at %s", graph_path)
        if os.path.exists(reverse_path):
            with open(reverse_path, encoding="utf-8") as f:
                _reverse_graph = json.load(f)
            logger.info("Reverse citation graph loaded: %d citations", len(_reverse_graph))
        if os.path.exists(title_path):
            with open(title_path, encoding="utf-8") as f:
                _title_to_id = json.load(f)
            logger.info("Title index loaded: %d titles", len(_title_to_id))
        # Load parent judgments for text retrieval
        if os.path.exists(parent_path):
            store: Dict[str, str] = {}
            with open(parent_path, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        j = json.loads(line)
                    except json.JSONDecodeError:
                        # Best-effort: skip malformed lines, keep loading.
                        continue
                    if not isinstance(j, dict):
                        # Valid JSON but not an object — nothing to index.
                        continue
                    jid = j.get("judgment_id", "")
                    if jid:
                        store[jid] = j.get("text", "")
            # Rebind rather than mutate so a reload drops stale entries.
            _parent_store = store
            logger.info("Parent store loaded: %d judgments", len(_parent_store))
        _loaded = True
    except Exception as e:
        logger.error("Citation graph load failed: %s. Precedent chain disabled.", e)
        _loaded = False
def _resolve_citation_to_judgment(citation_string: str) -> Optional[str]:
    """
    Try to match a citation string to a judgment_id.

    Uses multiple strategies in order of reliability:
      1. Exact hit in the reverse citation graph.
      2. Exact hit on the normalised title index.
      3. Prefix match against the title index.

    Returns:
        The first matching judgment_id, or None if nothing matches.
    """
    if not citation_string:
        return None
    # Strategy 1: Check reverse graph directly (single lookup via .get).
    refs = _reverse_graph.get(citation_string)
    if refs:
        return refs[0]
    # Strategy 2: Normalise (lowercase, strip punctuation, cap at 50 chars)
    # and check title index.
    normalised = re.sub(r'[^\w\s]', '', citation_string.lower())[:50]
    if normalised in _title_to_id:
        return _title_to_id[normalised]
    # Strategy 3: Partial match on title index. The length guard and the
    # prefix slice are loop-invariant, so hoist them out of the scan;
    # very short strings are skipped entirely (too many false matches).
    if len(normalised) > 10:
        prefix = normalised[:20]
        for title, jid in _title_to_id.items():
            if prefix in title:
                return jid
    return None
def get_precedent_chain(
    judgment_ids: List[str],
    max_precedents: int = 3
) -> List[Dict]:
    """
    Given a list of retrieved judgment IDs, return their cited predecessors.

    Args:
        judgment_ids: IDs of judgments already retrieved by FAISS
        max_precedents: maximum number of precedent chunks to return

    Returns:
        List of precedent dicts with same structure as regular chunks,
        plus 'is_precedent': True and 'cited_by' field. Empty if the
        graph is not loaded or max_precedents is not positive.
    """
    # Early exit on non-positive budget: the old post-append break check
    # would otherwise still return one precedent for max_precedents=0.
    if not _loaded or not _graph or max_precedents <= 0:
        return []
    precedents: List[Dict] = []
    # Seed with the retrieved IDs so we never re-add a judgment the user
    # already has, and dedupe precedents across the input list.
    seen_ids = set(judgment_ids)
    for jid in judgment_ids:
        for citation_ref in _graph.get(jid, [])[:3]:  # max 3 citations per judgment
            resolved_id = _resolve_citation_to_judgment(citation_ref)
            if not resolved_id or resolved_id in seen_ids:
                continue
            # Get text from parent store; skip citations we cannot show.
            text = _parent_store.get(resolved_id, "")
            if not text:
                continue
            seen_ids.add(resolved_id)
            # Extract a useful excerpt — first 1500 chars after any header
            excerpt = text[:1500].strip()
            precedents.append({
                "judgment_id": resolved_id,
                "chunk_id": f"{resolved_id}_precedent",
                "text": excerpt,
                "title": f"Precedent: {citation_ref[:80]}",
                # ID convention appears to be <name>_<year>_... — TODO confirm
                "year": resolved_id.split("_")[1] if "_" in resolved_id else "",
                "source_type": "case_law",
                "is_precedent": True,
                "cited_by": jid,
                "citation_ref": citation_ref,
                "similarity_score": 0.5  # precedents are added, not ranked
            })
            if len(precedents) >= max_precedents:
                break
        if len(precedents) >= max_precedents:
            break
    if precedents:
        logger.info("Precedent chain: added %d predecessor judgments", len(precedents))
    return precedents
def get_citation_count(judgment_id: str) -> int:
    """Count how many judgments in the graph cite the given judgment.

    Resolves every citation edge, so this is O(edges) per call —
    fine for occasional use, not a per-query hot path.
    """
    return sum(
        1
        for cited_refs in _graph.values()
        for ref in cited_refs
        if _resolve_citation_to_judgment(ref) == judgment_id
    )
def is_loaded() -> bool:
    """Report whether the citation-graph artifacts were loaded at startup."""
    return _loaded