Spaces:

gaurv007
/

ClauseGuard

Sleeping

App Files Files Community

ClauseGuard / chatbot.py

gaurv007

⚡ v4.3: Performance optimizations — ONNX INT8, BGE embedder, batched classification, thread control (#4)

f4b6528 12 days ago

raw

history blame contribute delete

16.8 kB

	"""
	ClauseGuard — Contract Q&A Chatbot (RAG) v1.0
	═══════════════════════════════════════════════
	Architecture:
	User asks question about their contract
	↓
	[1] Embed question with sentence-transformers (all-MiniLM-L6-v2)
	↓
	[2] Retrieve top-5 most relevant chunks from contract
	↓
	[3] Build prompt:
	- System: ClauseGuard analysis results (clauses, entities, risk scores)
	- Context: Retrieved contract chunks (≤2.5K tokens)
	- User question
	↓
	[4] Stream response from LLM via HF Inference API

	Key design:
	• Analyzed data (clauses, entities, risk scores) → system prompt
	• Raw contract text → RAG retrieval
	• This gives the model both structured analysis AND verbatim evidence
	"""

	import os
	import re
	import numpy as np

	# ── Embedding model (soft-fail) ─────────────────────────────────────
	_HAS_EMBEDDER = False
	_embedder = None

	try:
	from sentence_transformers import SentenceTransformer
	_HAS_EMBEDDER = True
	except ImportError:
	pass

	# ── HF Inference Client (soft-fail) ─────────────────────────────────
	_HAS_INFERENCE = False
	_llm_client = None

	try:
	from huggingface_hub import InferenceClient
	_HAS_INFERENCE = True
	except ImportError:
	pass

	# ═══════════════════════════════════════════════════════════════════════
	# MODEL LOADING
	# ═══════════════════════════════════════════════════════════════════════

	_chatbot_status = {"embedder": "not_loaded", "llm": "not_loaded"}

	def _load_embedder():
	"""Load sentence-transformers embedding model (lazy).
	PERF v4.3: Upgraded from all-MiniLM-L6-v2 to BAAI/bge-small-en-v1.5
	(+21% MTEB retrieval accuracy, same 384-dim, same latency)."""
	global _embedder, _chatbot_status
	if _embedder is not None:
	return _embedder
	if not _HAS_EMBEDDER:
	_chatbot_status["embedder"] = "unavailable"
	return None
	try:
	print("[ClauseGuard Chat] Loading embedding model: BAAI/bge-small-en-v1.5...")
	_embedder = SentenceTransformer("BAAI/bge-small-en-v1.5")
	_chatbot_status["embedder"] = "loaded"
	print("[ClauseGuard Chat] Embedding model loaded (BGE-small, 384-dim)")
	return _embedder
	except Exception as e:
	_chatbot_status["embedder"] = f"failed: {e}"
	print(f"[ClauseGuard Chat] Embedder load failed: {e}")
	return None


	def _get_llm_client():
	"""Get or create HF Inference Client (lazy)."""
	global _llm_client, _chatbot_status
	if _llm_client is not None:
	return _llm_client
	if not _HAS_INFERENCE:
	_chatbot_status["llm"] = "unavailable"
	return None
	try:
	token = os.environ.get("HF_TOKEN", "")
	_llm_client = InferenceClient(
	provider="hf-inference",
	api_key=token if token else None,
	)
	_chatbot_status["llm"] = "loaded"
	print("[ClauseGuard Chat] HF Inference Client initialized")
	return _llm_client
	except Exception as e:
	_chatbot_status["llm"] = f"failed: {e}"
	print(f"[ClauseGuard Chat] LLM client init failed: {e}")
	return None


	def get_chatbot_status():
	"""Return human-readable chatbot status."""
	parts = []
	for name, status in _chatbot_status.items():
	icon = "✅" if status == "loaded" else "⚠️" if "failed" in status else "❌"
	label = {"embedder": "Embeddings", "llm": "LLM API"}[name]
	parts.append(f"{icon} {label}: {status}")
	return " · ".join(parts)


	# ═══════════════════════════════════════════════════════════════════════
	# TEXT CHUNKING (sentence-preserving, ~300 tokens, no overlap)
	# ═══════════════════════════════════════════════════════════════════════

	def chunk_contract_text(text, target_chunk_size=300, min_chunk_size=50):
	"""
	Split contract text into chunks for RAG retrieval.
	Sentence-preserving, ~300 tokens per chunk, 0% overlap.
	Research (arxiv 2601.14123): overlap adds cost with zero benefit.
	"""
	if not text:
	return []

	# First split on paragraph boundaries
	paragraphs = re.split(r'\n\n+', text)
	chunks = []
	current_chunk = ""

	for para in paragraphs:
	para = para.strip()
	if not para:
	continue

	# Estimate word count (rough token proxy)
	words_current = len(current_chunk.split())
	words_para = len(para.split())

	if words_current + words_para <= target_chunk_size:
	current_chunk += ("\n\n" + para if current_chunk else para)
	else:
	# Current chunk is full enough — save it
	if words_current >= min_chunk_size:
	chunks.append(current_chunk.strip())
	current_chunk = para
	else:
	# Current chunk too small — need to split the paragraph into sentences
	sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', para)
	for sent in sentences:
	words_current = len(current_chunk.split())
	words_sent = len(sent.split())
	if words_current + words_sent <= target_chunk_size:
	current_chunk += (" " + sent if current_chunk else sent)
	else:
	if words_current >= min_chunk_size:
	chunks.append(current_chunk.strip())
	current_chunk = sent

	# Don't forget the last chunk
	if current_chunk.strip() and len(current_chunk.split()) >= min_chunk_size:
	chunks.append(current_chunk.strip())

	return chunks


	# ═══════════════════════════════════════════════════════════════════════
	# EMBEDDING & RETRIEVAL
	# ═══════════════════════════════════════════════════════════════════════

	def build_embeddings(chunks):
	"""
	Embed chunks using sentence-transformers.
	Returns numpy array of shape (N, 384) or None if embedder unavailable.
	"""
	embedder = _load_embedder()
	if embedder is None or not chunks:
	return None
	try:
	embeddings = embedder.encode(
	chunks,
	normalize_embeddings=True,
	batch_size=32,
	show_progress_bar=False,
	)
	return embeddings # numpy array (N, 384)
	except Exception as e:
	print(f"[ClauseGuard Chat] Embedding error: {e}")
	return None


	def retrieve_chunks(query, chunks, embeddings, top_k=5):
	"""
	Retrieve top-k most relevant chunks for a query.
	Uses cosine similarity (embeddings are L2-normalized → dot product = cosine).
	Context budget: top-5 chunks, ≤2.5K tokens.
	"""
	embedder = _load_embedder()
	if embedder is None or embeddings is None or not chunks:
	return []

	try:
	# PERF v4.3: BGE models require query instruction prefix for retrieval
	_BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "
	q_emb = embedder.encode([_BGE_QUERY_PREFIX + query], normalize_embeddings=True)
	scores = (q_emb @ embeddings.T)[0]
	top_indices = np.argsort(scores)[::-1][:top_k]

	results = []
	total_words = 0
	max_words = 600 # ~2.5K tokens budget

	for idx in top_indices:
	chunk = chunks[idx]
	chunk_words = len(chunk.split())
	if total_words + chunk_words > max_words and results:
	break
	results.append({
	"text": chunk,
	"score": float(scores[idx]),
	"index": int(idx),
	})
	total_words += chunk_words

	return results
	except Exception as e:
	print(f"[ClauseGuard Chat] Retrieval error: {e}")
	return []


	# ═══════════════════════════════════════════════════════════════════════
	# SYSTEM PROMPT BUILDER
	# ═══════════════════════════════════════════════════════════════════════

	def _build_system_prompt(analysis_result, retrieved_chunks):
	"""
	Build the system prompt with:
	1. ClauseGuard analysis results (clauses, entities, risk scores) — NOT through RAG
	2. Retrieved contract chunks — through RAG
	"""
	parts = []

	parts.append("""You are ClauseGuard AI, a legal contract analysis assistant. You help users understand their contracts by answering questions based on the contract text and analysis results.

	RULES:
	- Answer ONLY based on the provided contract text and analysis. Never make up information.
	- If the answer isn't in the provided context, say "I don't see that information in the analyzed contract."
	- Cite specific clauses or sections when possible.
	- Be concise but thorough. Use plain language, not legal jargon.
	- Always end with: "⚠️ This is AI analysis, not legal advice. Consult an attorney for legal decisions."
	""")

	# Add analysis summary if available
	if analysis_result:
	risk = analysis_result.get("risk", {})
	parts.append(f"""
	═══ CONTRACT ANALYSIS SUMMARY ═══
	Risk Score: {risk.get('score', 'N/A')}/100 (Grade {risk.get('grade', 'N/A')})
	Risk Breakdown: {risk.get('breakdown', {})}
	Total Clauses Analyzed: {analysis_result.get('metadata', {}).get('total_clauses', 'N/A')}
	Flagged Clauses: {analysis_result.get('metadata', {}).get('flagged_clauses', 'N/A')}
	""")

	# Add detected clauses summary
	clauses = analysis_result.get("clauses", [])
	if clauses:
	clause_summary = []
	seen = set()
	for c in clauses:
	key = c["label"]
	if key not in seen:
	seen.add(key)
	risk_level = c.get("risk", "LOW")
	clause_summary.append(f" • [{risk_level}] {key}: {c.get('description', '')}")
	parts.append("═══ DETECTED CLAUSES ═══\n" + "\n".join(clause_summary[:20]))

	# Add entities summary
	entities = analysis_result.get("entities", [])
	if entities:
	entity_summary = []
	seen = set()
	for e in entities:
	key = f"{e['type']}: {e['text']}"
	if key not in seen and len(seen) < 15:
	seen.add(key)
	entity_summary.append(f" • {e['type']}: {e['text']}")
	parts.append("═══ EXTRACTED ENTITIES ═══\n" + "\n".join(entity_summary))

	# Add contradictions
	contradictions = analysis_result.get("contradictions", [])
	if contradictions:
	contra_summary = []
	for c in contradictions:
	contra_summary.append(f" • [{c['type']}] {c['explanation']}")
	parts.append("═══ CONTRADICTIONS / ISSUES ═══\n" + "\n".join(contra_summary))

	# Add retrieved contract text
	if retrieved_chunks:
	context_text = "\n---\n".join(c["text"] for c in retrieved_chunks)
	parts.append(f"""
	═══ RELEVANT CONTRACT TEXT (Retrieved) ═══
	{context_text}
	""")

	return "\n\n".join(parts)


	# ═══════════════════════════════════════════════════════════════════════
	# CHAT RESPONSE (Streaming)
	# ═══════════════════════════════════════════════════════════════════════

	# LLM model to use
	_LLM_MODEL = "Qwen/Qwen2.5-7B-Instruct"

	def chat_respond(message, history, chunks, embeddings, analysis_result):
	"""
	RAG chatbot response function for gr.ChatInterface.

	Args:
	message: User's question (str)
	history: Chat history (list of dicts with role/content)
	chunks: Contract text chunks (list of str)
	embeddings: Chunk embeddings (numpy array or None)
	analysis_result: Full analysis result dict (or None)

	Yields:
	Partial response string (streaming)
	"""
	# Validate inputs
	if not chunks or embeddings is None:
	yield ("⚠️ No contract loaded yet. Please upload and analyze a contract in the "
	"📄 Single Contract Analysis tab first, then come back here to ask questions.")
	return

	if not message or not message.strip():
	yield "Please ask a question about your contract."
	return

	# Step 1: Retrieve relevant chunks
	retrieved = retrieve_chunks(message, chunks, embeddings, top_k=5)

	# Step 2: Build system prompt with analysis + retrieved context
	system_prompt = _build_system_prompt(analysis_result, retrieved)

	# Step 3: Build message history for LLM
	messages = [{"role": "system", "content": system_prompt}]

	# Add recent history (last 6 turns to stay in context window)
	if history:
	for h in history[-6:]:
	messages.append({"role": h["role"], "content": h["content"]})

	messages.append({"role": "user", "content": message})

	# Step 4: Stream response from LLM
	client = _get_llm_client()
	if client is None:
	yield ("⚠️ LLM service unavailable. Please ensure `huggingface_hub` is installed "
	"and `HF_TOKEN` is set.")
	return

	try:
	stream = client.chat_completion(
	model=_LLM_MODEL,
	messages=messages,
	max_tokens=1024,
	stream=True,
	temperature=0.3, # Low temperature for factual responses
	)
	partial = ""
	for chunk in stream:
	token = chunk.choices[0].delta.content or ""
	partial += token
	yield partial
	except Exception as e:
	error_msg = str(e)
	if "rate limit" in error_msg.lower() or "429" in error_msg:
	yield ("⚠️ Rate limit reached on the free HF Inference API. "
	"Please wait a moment and try again.")
	elif "401" in error_msg or "unauthorized" in error_msg.lower():
	yield ("⚠️ Authentication error. Please set your HF_TOKEN in the Space settings.")
	else:
	yield f"⚠️ Error generating response: {error_msg}\n\nPlease try again."


	# ═══════════════════════════════════════════════════════════════════════
	# INDEXING HELPER (combines chunking + embedding)
	# ═══════════════════════════════════════════════════════════════════════

	def index_contract(text):
	"""
	Chunk and embed contract text for RAG retrieval.

	Returns: (chunks, embeddings, status_message)
	chunks: list of str
	embeddings: numpy array or None
	status_message: str
	"""
	if not text or len(text.strip()) < 50:
	return [], None, "⚠️ No contract text to index"

	chunks = chunk_contract_text(text)
	if not chunks:
	return [], None, "⚠️ Could not split contract into chunks"

	embeddings = build_embeddings(chunks)
	if embeddings is None:
	return chunks, None, "⚠️ Embedding model unavailable — chatbot will not work"

	return (
	chunks,
	embeddings,
	f"✅ Indexed {len(chunks)} chunks ({len(text)} chars) — Ready to chat!"
	)