| """ | |
| RAG (Retrieval-Augmented Generation) pipeline for answer generation. | |
| """ | |
| import re | |
| import unicodedata | |
| from typing import List, Dict, Any, Optional | |
| from .hybrid_search import hybrid_search | |
| from .models import Procedure, Fine, Office, Advisory, LegalSection | |
| from hue_portal.chatbot.chatbot import format_fine_amount | |
| from hue_portal.chatbot.llm_integration import get_llm_generator | |
| from hue_portal.chatbot.structured_legal import format_structured_legal_answer | |
def retrieve_top_k_documents(
    query: str,
    content_type: str,
    top_k: int = 5
) -> List[Any]:
    """
    Retrieve top-k documents using hybrid search.

    Args:
        query: Search query.
        content_type: Type of content ('procedure', 'fine', 'office', 'advisory', 'legal').
        top_k: Number of documents to retrieve.

    Returns:
        List of document objects.
    """
    # Pick the queryset and the text fields used for exact-match boosting.
    if content_type == 'procedure':
        queryset = Procedure.objects.all()
        text_fields = ['title', 'domain', 'conditions', 'dossier']
    elif content_type == 'fine':
        queryset = Fine.objects.all()
        text_fields = ['name', 'code', 'article', 'decree', 'remedial']
    elif content_type == 'office':
        queryset = Office.objects.all()
        text_fields = ['unit_name', 'address', 'district', 'service_scope']
    elif content_type == 'advisory':
        queryset = Advisory.objects.all()
        text_fields = ['title', 'summary']
    elif content_type == 'legal':
        queryset = LegalSection.objects.select_related("document").all()
        text_fields = ['section_title', 'section_code', 'content']
    else:
        return []

    # Run hybrid search with per-content-type weights from config.
    try:
        from .config.hybrid_search_config import get_config
        config = get_config(content_type)
        results = hybrid_search(
            queryset,
            query,
            top_k=top_k,
            bm25_weight=config.bm25_weight,
            vector_weight=config.vector_weight,
            min_hybrid_score=config.min_hybrid_score,
            text_fields=text_fields
        )
        return results
    except Exception as e:
        print(f"Error in retrieval: {e}")
        return []

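# Illustrative usage (a sketch, not part of the pipeline; assumes Django is
# configured and the corpus has been ingested — the query here is hypothetical):
#
#     docs = retrieve_top_k_documents("đăng ký tạm trú", "procedure", top_k=3)
#     for doc in docs:
#         print(doc.title, getattr(doc, "_hybrid_score", None))
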
def generate_answer_template(
    query: str,
    documents: List[Any],
    content_type: str,
    context: Optional[List[Dict[str, Any]]] = None,
    use_llm: bool = True
) -> str:
    """
    Generate an answer using the LLM (if available) or template-based summarization.

    Args:
        query: Original query.
        documents: Retrieved documents.
        content_type: Type of content.
        context: Optional conversation context.
        use_llm: Whether to try LLM generation first.

    Returns:
        Generated answer text.
    """
    def _invoke_llm(documents_for_prompt: List[Any]) -> Optional[str]:
        """Call the configured LLM provider, returning None on any failure."""
        try:
            llm = get_llm_generator()
            if not llm:
                print("[RAG] ⚠️ LLM not available, using template", flush=True)
                return None
            print(f"[RAG] Using LLM provider: {llm.provider}", flush=True)
            llm_answer = llm.generate_answer(
                query,
                context=context,
                documents=documents_for_prompt
            )
            if llm_answer:
                print(f"[RAG] ✅ LLM answer generated (length: {len(llm_answer)})", flush=True)
                return llm_answer
            print("[RAG] ⚠️ LLM returned None, using template", flush=True)
        except Exception as exc:
            import traceback
            error_trace = traceback.format_exc()
            print(f"[RAG] ❌ LLM generation failed, using template: {exc}", flush=True)
            print(f"[RAG] ❌ Trace: {error_trace}", flush=True)
        return None

    llm_enabled = use_llm or content_type == 'general'
    if llm_enabled:
        llm_documents = documents if documents else []
        llm_answer = _invoke_llm(llm_documents)
        if llm_answer:
            return llm_answer

    # If no documents were retrieved, fall back gracefully.
    if not documents:
        if content_type == 'general':
            return (
                f"Tôi chưa có dữ liệu pháp luật liên quan đến '{query}', "
                "nhưng vẫn sẵn sàng trò chuyện hoặc hỗ trợ bạn ở chủ đề khác. "
                "Bạn có thể mô tả cụ thể hơn để tôi giúp tốt hơn nhé!"
            )
        return (
            f"Xin lỗi, tôi không tìm thấy thông tin liên quan đến '{query}' trong cơ sở dữ liệu. "
            "Vui lòng thử lại với từ khóa khác hoặc liên hệ trực tiếp với Công an thành phố Huế để được tư vấn."
        )

    # Fall back to template-based generation.
    if content_type == 'procedure':
        return _generate_procedure_answer(query, documents)
    elif content_type == 'fine':
        return _generate_fine_answer(query, documents)
    elif content_type == 'office':
        return _generate_office_answer(query, documents)
    elif content_type == 'advisory':
        return _generate_advisory_answer(query, documents)
    elif content_type == 'legal':
        return _generate_legal_answer(query, documents)
    else:
        return _generate_general_answer(query, documents)

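# Illustrative fallback path (a sketch; the query below is hypothetical):
#
#     answer = generate_answer_template(
#         "thủ tục cấp hộ chiếu", documents=[], content_type="procedure", use_llm=False
#     )
#     # With no documents and use_llm=False this returns the apology template.
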
def _generate_procedure_answer(query: str, documents: List[Procedure]) -> str:
    """Generate answer for procedure queries."""
    count = len(documents)
    answer = f"Tôi tìm thấy {count} thủ tục liên quan đến '{query}':\n\n"
    for i, doc in enumerate(documents[:5], 1):
        answer += f"{i}. {doc.title}\n"
        if doc.domain:
            answer += f" Lĩnh vực: {doc.domain}\n"
        if doc.level:
            answer += f" Cấp: {doc.level}\n"
        if doc.conditions:
            conditions_short = doc.conditions[:100] + "..." if len(doc.conditions) > 100 else doc.conditions
            answer += f" Điều kiện: {conditions_short}\n"
        answer += "\n"
    if count > 5:
        answer += f"... và {count - 5} thủ tục khác.\n"
    return answer

def _generate_fine_answer(query: str, documents: List[Fine]) -> str:
    """Generate answer for fine queries."""
    count = len(documents)
    answer = f"Tôi tìm thấy {count} mức phạt liên quan đến '{query}':\n\n"

    # Highlight the best match (first result).
    if documents:
        best_match = documents[0]
        answer += "Kết quả chính xác nhất:\n"
        answer += f"• {best_match.name}\n"
        if best_match.code:
            answer += f" Mã vi phạm: {best_match.code}\n"
        # Format the fine range using the shared helper.
        fine_amount = format_fine_amount(
            float(best_match.min_fine) if best_match.min_fine else None,
            float(best_match.max_fine) if best_match.max_fine else None
        )
        if fine_amount:
            answer += f" Mức phạt: {fine_amount}\n"
        if best_match.article:
            answer += f" Điều luật: {best_match.article}\n"
        answer += "\n"

        # Add the remaining results, if any.
        if count > 1:
            answer += "Các mức phạt khác:\n"
            for i, doc in enumerate(documents[1:5], 2):
                answer += f"{i}. {doc.name}\n"
                if doc.code:
                    answer += f" Mã vi phạm: {doc.code}\n"
                fine_amount = format_fine_amount(
                    float(doc.min_fine) if doc.min_fine else None,
                    float(doc.max_fine) if doc.max_fine else None
                )
                if fine_amount:
                    answer += f" Mức phạt: {fine_amount}\n"
                if doc.article:
                    answer += f" Điều luật: {doc.article}\n"
                answer += "\n"
    if count > 5:
        answer += f"... và {count - 5} mức phạt khác.\n"
    return answer

def _generate_office_answer(query: str, documents: List[Office]) -> str:
    """Generate answer for office queries."""
    count = len(documents)
    answer = f"Tôi tìm thấy {count} đơn vị liên quan đến '{query}':\n\n"
    for i, doc in enumerate(documents[:5], 1):
        answer += f"{i}. {doc.unit_name}\n"
        if doc.address:
            answer += f" Địa chỉ: {doc.address}\n"
        if doc.district:
            answer += f" Quận/Huyện: {doc.district}\n"
        if doc.phone:
            answer += f" Điện thoại: {doc.phone}\n"
        if doc.working_hours:
            answer += f" Giờ làm việc: {doc.working_hours}\n"
        answer += "\n"
    if count > 5:
        answer += f"... và {count - 5} đơn vị khác.\n"
    return answer

def _generate_advisory_answer(query: str, documents: List[Advisory]) -> str:
    """Generate answer for advisory queries."""
    count = len(documents)
    answer = f"Tôi tìm thấy {count} cảnh báo liên quan đến '{query}':\n\n"
    for i, doc in enumerate(documents[:5], 1):
        answer += f"{i}. {doc.title}\n"
        if doc.summary:
            summary_short = doc.summary[:150] + "..." if len(doc.summary) > 150 else doc.summary
            answer += f" {summary_short}\n"
        answer += "\n"
    if count > 5:
        answer += f"... và {count - 5} cảnh báo khác.\n"
    return answer

def _clean_text(value: str) -> str:
    """Normalize whitespace and strip noise for legal snippets."""
    if not value:
        return ""
    compressed = re.sub(r"\s+", " ", value)
    return compressed.strip()

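# For example, _clean_text("  Điều 1.\n\n  Phạm vi  ") returns "Điều 1. Phạm vi".
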
def _summarize_section(
    section: LegalSection,
    max_sentences: int = 3,
    max_chars: int = 600
) -> str:
    """
    Produce a concise Vietnamese summary directly from the stored content.

    This is used as the Vietnamese prefill before calling the LLM so we avoid
    English drift and keep the answer grounded.
    """
    content = _clean_text(section.content)
    if not content:
        return ""
    # Split by sentence boundaries; fall back to the whole text if delimiters are missing.
    sentences = re.split(r"(?<=[.!?])\s+", content)
    if not sentences:
        sentences = [content]
    summary_parts = []
    for sentence in sentences:
        if not sentence:
            continue
        summary_parts.append(sentence)
        joined = " ".join(summary_parts)
        if len(summary_parts) >= max_sentences or len(joined) >= max_chars:
            break
    summary = " ".join(summary_parts)
    if len(summary) > max_chars:
        summary = summary[:max_chars].rsplit(" ", 1)[0] + "..."
    return summary.strip()

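# The loop keeps whole sentences until either limit is hit, so the summary can
# overshoot max_chars by one sentence; the final slice-and-rsplit pass trims it
# back to a word boundary and appends "...".
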
def _format_citation(section: LegalSection) -> str:
    citation = section.document.title
    if section.section_code:
        citation = f"{citation} – {section.section_code}"
    page = ""
    if section.page_start:
        page = f" (trang {section.page_start}"
        if section.page_end and section.page_end != section.page_start:
            page += f"-{section.page_end}"
        page += ")"
    return f"{citation}{page}".strip()

def _build_legal_prefill(documents: List[LegalSection]) -> str:
    """
    Build a compact Vietnamese summary block that will be injected into the
    Guardrails prompt. The goal is to bias the model toward Vietnamese output.
    """
    if not documents:
        return ""
    lines = ["Bản tóm tắt tiếng Việt từ cơ sở dữ liệu:"]
    for idx, section in enumerate(documents[:3], start=1):
        summary = _summarize_section(section, max_sentences=2, max_chars=400)
        if not summary:
            continue
        citation = _format_citation(section)
        lines.append(f"{idx}. {summary} (Nguồn: {citation})")
    return "\n".join(lines)

def _generate_legal_citation_block(documents: List[LegalSection]) -> str:
    """Return a formatted citation block reused by multiple answer modes."""
    if not documents:
        return ""
    lines: List[str] = []
    for idx, section in enumerate(documents[:5], start=1):
        summary = _summarize_section(section)
        cleaned = _clean_text(section.content)
        snippet = cleaned[:350]
        # Add an ellipsis only when the content was actually truncated.
        if len(cleaned) > 350:
            snippet = snippet.rsplit(" ", 1)[0] + "..."
        citation = _format_citation(section)
        lines.append(f"{idx}. {section.section_title or 'Nội dung'} – {citation}")
        if summary:
            lines.append(f" - Tóm tắt: {summary}")
        if snippet:
            lines.append(f" - Trích dẫn: \"{snippet}\"")
        lines.append("")
    if len(documents) > 5:
        lines.append(f"... và {len(documents) - 5} trích đoạn khác trong cùng nguồn dữ liệu.")
    return "\n".join(lines).strip()

def _generate_legal_answer(query: str, documents: List[LegalSection]) -> str:
    count = len(documents)
    if count == 0:
        return (
            f"Tôi chưa tìm thấy trích dẫn pháp lý nào cho '{query}'. "
            "Bạn có thể cung cấp thêm ngữ cảnh để tôi tiếp tục hỗ trợ."
        )
    header = (
        f"Tôi đã tổng hợp {count} trích đoạn pháp lý liên quan đến '{query}'. "
        "Đây là bản tóm tắt tiếng Việt kèm trích dẫn:"
    )
    citation_block = _generate_legal_citation_block(documents)
    return f"{header}\n\n{citation_block}".strip()

def _generate_general_answer(query: str, documents: List[Any]) -> str:
    """Generate a general answer."""
    count = len(documents)
    return f"Tôi tìm thấy {count} kết quả liên quan đến '{query}'. Vui lòng xem chi tiết bên dưới."


def _strip_accents(value: str) -> str:
    return "".join(
        char for char in unicodedata.normalize("NFD", value)
        if unicodedata.category(char) != "Mn"
    )


def _contains_markers(
    text_with_accents: str,
    text_without_accents: str,
    markers: List[str]
) -> bool:
    for marker in markers:
        marker_lower = marker.lower()
        marker_no_accents = _strip_accents(marker_lower)
        if marker_lower in text_with_accents or marker_no_accents in text_without_accents:
            return True
    return False

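# Accent stripping makes the marker check diacritic-insensitive; e.g.
# _strip_accents("không tìm thấy") == "khong tim thay", so an answer written
# without diacritics still trips the accented denial marker.
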
def _is_valid_legal_answer(answer: str, documents: List[LegalSection]) -> bool:
    """
    Validate that the LLM answer for legal intent references actual legal content.

    Criteria:
    - Must not contain denial phrases (already handled earlier) or "xin lỗi".
    - Must not introduce obvious monetary values (the legal corpus contains no fine amounts).
    - Must be at least 40 characters long, to reject overly short answers.
    """
    if not answer:
        return False
    normalized_answer = answer.lower()
    normalized_answer_no_accents = _strip_accents(normalized_answer)
    denial_markers = [
        "xin lỗi",
        "thông tin trong cơ sở dữ liệu chưa đủ",
        "không thể giúp",
        "không tìm thấy thông tin",
        "không có dữ liệu",
    ]
    if _contains_markers(normalized_answer, normalized_answer_no_accents, denial_markers):
        return False
    # A bare "đ" marker would match almost any Vietnamese word ("điều", "định", ...),
    # so currency is detected with word-level markers plus a digit-adjacent "đ" pattern.
    money_markers = ["vnđ", "vnd", "đồng", "dong"]
    if _contains_markers(normalized_answer, normalized_answer_no_accents, money_markers):
        return False
    if re.search(r"\d\s*đ\b", normalized_answer):
        return False
    if len(answer.strip()) < 40:
        return False
    return True

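# Illustrative outcomes (hypothetical strings):
#     _is_valid_legal_answer("Xin lỗi, tôi không thể giúp.", docs)      -> False (denial)
#     _is_valid_legal_answer("Mức phạt là 500.000đ theo điều 5.", docs) -> False (money)
#     A 40+ character Vietnamese summary quoting the section passes.
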
def rag_pipeline(
    query: str,
    intent: str,
    top_k: int = 5,
    min_confidence: float = 0.3,
    context: Optional[List[Dict[str, Any]]] = None,
    use_llm: bool = True
) -> Dict[str, Any]:
    """
    Complete RAG pipeline: retrieval + answer generation.

    Args:
        query: User query.
        intent: Detected intent.
        top_k: Number of documents to retrieve.
        min_confidence: Minimum confidence threshold.
        context: Optional conversation context.
        use_llm: Whether to use LLM for answer generation.

    Returns:
        Dictionary with 'answer', 'documents', 'count', 'confidence', 'content_type'.
    """
    # Map intent to content type.
    intent_to_type = {
        'search_procedure': 'procedure',
        'search_fine': 'fine',
        'search_office': 'office',
        'search_advisory': 'advisory',
        'search_legal': 'legal',
        'general_query': 'general',
        'greeting': 'general',
    }
    content_type = intent_to_type.get(intent, 'procedure')

    # Retrieve documents.
    documents = retrieve_top_k_documents(query, content_type, top_k=top_k)

    # Enable the LLM automatically for casual conversation intents.
    llm_allowed = use_llm or intent in {"general_query", "greeting"}
    structured_used = False
    answer: Optional[str] = None

    # For legal intent, try the structured Guardrails path first.
    if intent == "search_legal" and documents:
        llm = get_llm_generator()
        if llm:
            prefill_summary = _build_legal_prefill(documents)
            structured = llm.generate_structured_legal_answer(
                query,
                documents,
                prefill_summary=prefill_summary,
            )
            if structured:
                answer = format_structured_legal_answer(structured)
                structured_used = True
                citation_block = _generate_legal_citation_block(documents)
                if citation_block:
                    answer = (
                        f"{answer.rstrip()}\n\nTrích dẫn chi tiết:\n{citation_block}"
                    )

    if answer is None:
        answer = generate_answer_template(
            query,
            documents,
            content_type,
            context=context,
            use_llm=llm_allowed
        )
    # Fallback when the intent is legal but the LLM answer fails validation.
    if (
        intent == "search_legal"
        and documents
        and isinstance(answer, str)
        and not structured_used
    ):
        if not _is_valid_legal_answer(answer, documents):
            print("[RAG] ⚠️ Fallback: invalid legal answer detected", flush=True)
            answer = _generate_legal_answer(query, documents)
        else:
            # Append only the citation block so the validated LLM answer keeps
            # its own wording instead of a second full templated answer.
            citation_block = _generate_legal_citation_block(documents)
            if citation_block.strip():
                answer = f"{answer.rstrip()}\n\nTrích dẫn chi tiết:\n{citation_block}"
    # Confidence: simple heuristic based on result count, boosted by the top hybrid score.
    confidence = min(1.0, len(documents) / top_k)
    if documents and hasattr(documents[0], '_hybrid_score'):
        confidence = max(confidence, documents[0]._hybrid_score)

    return {
        'answer': answer,
        'documents': documents,
        'count': len(documents),
        'confidence': confidence,
        'content_type': content_type
    }
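
# Illustrative end-to-end call (a sketch; assumes Django settings are configured
# and data has been ingested — the query and intent below are hypothetical):
#
#     result = rag_pipeline("mức phạt nồng độ cồn xe máy", intent="search_fine")
#     print(result["confidence"], result["count"])
#     print(result["answer"])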