| """ | |
| RAG (Retrieval-Augmented Generation) pipeline for answer generation. | |
| """ | |
| import re | |
| import unicodedata | |
| from typing import List, Dict, Any, Optional | |
| from .hybrid_search import hybrid_search | |
| from .models import Procedure, Fine, Office, Advisory, LegalSection | |
| from hue_portal.chatbot.chatbot import format_fine_amount | |
| from hue_portal.chatbot.llm_integration import get_llm_generator | |
| from hue_portal.chatbot.structured_legal import format_structured_legal_answer | |
def retrieve_top_k_documents(
    query: str,
    content_type: str,
    top_k: int = 5
) -> List[Any]:
    """
    Retrieve top-k documents using hybrid search.

    Args:
        query: Search query.
        content_type: Type of content ('procedure', 'fine', 'office', 'advisory', 'legal').
        top_k: Number of documents to retrieve.

    Returns:
        List of document objects.
    """
    # Pick the queryset and the text fields used for exact-match boosting.
    if content_type == 'procedure':
        queryset = Procedure.objects.all()
        text_fields = ['title', 'domain', 'conditions', 'dossier']
    elif content_type == 'fine':
        queryset = Fine.objects.all()
        text_fields = ['name', 'code', 'article', 'decree', 'remedial']
    elif content_type == 'office':
        queryset = Office.objects.all()
        text_fields = ['unit_name', 'address', 'district', 'service_scope']
    elif content_type == 'advisory':
        queryset = Advisory.objects.all()
        text_fields = ['title', 'summary']
    elif content_type == 'legal':
        queryset = LegalSection.objects.select_related("document").all()
        text_fields = ['section_title', 'section_code', 'content']
    else:
        return []

    # Run hybrid search with per-content-type weights from config.
    try:
        from .config.hybrid_search_config import get_config
        config = get_config(content_type)
        results = hybrid_search(
            queryset,
            query,
            top_k=top_k,
            bm25_weight=config.bm25_weight,
            vector_weight=config.vector_weight,
            min_hybrid_score=config.min_hybrid_score,
            text_fields=text_fields
        )
        return results
    except Exception as e:
        print(f"Error in retrieval: {e}")
        return []

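# Illustrative usage (a sketch, not part of the pipeline; assumes Django is
# configured and the corpus has been ingested — the query here is hypothetical):
#
#     docs = retrieve_top_k_documents("đăng ký tạm trú", "procedure", top_k=3)
#     for doc in docs:
#         print(doc.title, getattr(doc, "_hybrid_score", None))
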
def generate_answer_template(
    query: str,
    documents: List[Any],
    content_type: str,
    context: Optional[List[Dict[str, Any]]] = None,
    use_llm: bool = True
) -> str:
    """
    Generate an answer using the LLM (if available) or template-based summarization.

    Args:
        query: Original query.
        documents: Retrieved documents.
        content_type: Type of content.
        context: Optional conversation context.
        use_llm: Whether to try LLM generation first.

    Returns:
        Generated answer text.
    """
    def _invoke_llm(documents_for_prompt: List[Any]) -> Optional[str]:
        """Call the configured LLM provider, returning None on any failure."""
        try:
            llm = get_llm_generator()
            if not llm:
                print("[RAG] ⚠️ LLM not available, using template", flush=True)
                return None
            print(f"[RAG] Using LLM provider: {llm.provider}", flush=True)
            llm_answer = llm.generate_answer(
                query,
                context=context,
                documents=documents_for_prompt
            )
            if llm_answer:
                print(f"[RAG] ✅ LLM answer generated (length: {len(llm_answer)})", flush=True)
                return llm_answer
            print("[RAG] ⚠️ LLM returned None, using template", flush=True)
        except Exception as exc:
            import traceback
            error_trace = traceback.format_exc()
            print(f"[RAG] ❌ LLM generation failed, using template: {exc}", flush=True)
            print(f"[RAG] ❌ Trace: {error_trace}", flush=True)
        return None

    llm_enabled = use_llm or content_type == 'general'
    if llm_enabled:
        llm_documents = documents if documents else []
        llm_answer = _invoke_llm(llm_documents)
        if llm_answer:
            return llm_answer

    # If no documents were retrieved, fall back gracefully.
    if not documents:
        if content_type == 'general':
            return (
                f"Tôi chưa có dữ liệu pháp luật liên quan đến '{query}', "
                "nhưng vẫn sẵn sàng trò chuyện hoặc hỗ trợ bạn ở chủ đề khác. "
                "Bạn có thể mô tả cụ thể hơn để tôi giúp tốt hơn nhé!"
            )
        return (
            f"Xin lỗi, tôi không tìm thấy thông tin liên quan đến '{query}' trong cơ sở dữ liệu. "
            "Vui lòng thử lại với từ khóa khác hoặc liên hệ trực tiếp với Công an thành phố Huế để được tư vấn."
        )

    # Fall back to template-based generation.
    if content_type == 'procedure':
        return _generate_procedure_answer(query, documents)
    elif content_type == 'fine':
        return _generate_fine_answer(query, documents)
    elif content_type == 'office':
        return _generate_office_answer(query, documents)
    elif content_type == 'advisory':
        return _generate_advisory_answer(query, documents)
    elif content_type == 'legal':
        return _generate_legal_answer(query, documents)
    else:
        return _generate_general_answer(query, documents)

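# Illustrative fallback path (a sketch; the query below is hypothetical):
#
#     answer = generate_answer_template(
#         "thủ tục cấp hộ chiếu", documents=[], content_type="procedure", use_llm=False
#     )
#     # With no documents and use_llm=False this returns the apology template.
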
def _generate_procedure_answer(query: str, documents: List[Procedure]) -> str:
    """Generate answer for procedure queries."""
    count = len(documents)
    answer = f"Tôi tìm thấy {count} thủ tục liên quan đến '{query}':\n\n"
    for i, doc in enumerate(documents[:5], 1):
        answer += f"{i}. {doc.title}\n"
        if doc.domain:
            answer += f" Lĩnh vực: {doc.domain}\n"
        if doc.level:
            answer += f" Cấp: {doc.level}\n"
        if doc.conditions:
            conditions_short = doc.conditions[:100] + "..." if len(doc.conditions) > 100 else doc.conditions
            answer += f" Điều kiện: {conditions_short}\n"
        answer += "\n"
    if count > 5:
        answer += f"... và {count - 5} thủ tục khác.\n"
    return answer

def _generate_fine_answer(query: str, documents: List[Fine]) -> str:
    """Generate answer for fine queries."""
    count = len(documents)
    answer = f"Tôi tìm thấy {count} mức phạt liên quan đến '{query}':\n\n"

    # Highlight the best match (first result).
    if documents:
        best_match = documents[0]
        answer += "Kết quả chính xác nhất:\n"
        answer += f"• {best_match.name}\n"
        if best_match.code:
            answer += f" Mã vi phạm: {best_match.code}\n"
        # Format the fine range using the shared helper.
        fine_amount = format_fine_amount(
            float(best_match.min_fine) if best_match.min_fine else None,
            float(best_match.max_fine) if best_match.max_fine else None
        )
        if fine_amount:
            answer += f" Mức phạt: {fine_amount}\n"
        if best_match.article:
            answer += f" Điều luật: {best_match.article}\n"
        answer += "\n"

        # Add the remaining results, if any.
        if count > 1:
            answer += "Các mức phạt khác:\n"
            for i, doc in enumerate(documents[1:5], 2):
                answer += f"{i}. {doc.name}\n"
                if doc.code:
                    answer += f" Mã vi phạm: {doc.code}\n"
                fine_amount = format_fine_amount(
                    float(doc.min_fine) if doc.min_fine else None,
                    float(doc.max_fine) if doc.max_fine else None
                )
                if fine_amount:
                    answer += f" Mức phạt: {fine_amount}\n"
                if doc.article:
                    answer += f" Điều luật: {doc.article}\n"
                answer += "\n"
    if count > 5:
        answer += f"... và {count - 5} mức phạt khác.\n"
    return answer

def _generate_office_answer(query: str, documents: List[Office]) -> str:
    """Generate answer for office queries."""
    count = len(documents)
    answer = f"Tôi tìm thấy {count} đơn vị liên quan đến '{query}':\n\n"
    for i, doc in enumerate(documents[:5], 1):
        answer += f"{i}. {doc.unit_name}\n"
        if doc.address:
            answer += f" Địa chỉ: {doc.address}\n"
        if doc.district:
            answer += f" Quận/Huyện: {doc.district}\n"
        if doc.phone:
            answer += f" Điện thoại: {doc.phone}\n"
        if doc.working_hours:
            answer += f" Giờ làm việc: {doc.working_hours}\n"
        answer += "\n"
    if count > 5:
        answer += f"... và {count - 5} đơn vị khác.\n"
    return answer

def _generate_advisory_answer(query: str, documents: List[Advisory]) -> str:
    """Generate answer for advisory queries."""
    count = len(documents)
    answer = f"Tôi tìm thấy {count} cảnh báo liên quan đến '{query}':\n\n"
    for i, doc in enumerate(documents[:5], 1):
        answer += f"{i}. {doc.title}\n"
        if doc.summary:
            summary_short = doc.summary[:150] + "..." if len(doc.summary) > 150 else doc.summary
            answer += f" {summary_short}\n"
        answer += "\n"
    if count > 5:
        answer += f"... và {count - 5} cảnh báo khác.\n"
    return answer

def _clean_text(value: str) -> str:
    """Normalize whitespace and strip noise for legal snippets."""
    if not value:
        return ""
    compressed = re.sub(r"\s+", " ", value)
    return compressed.strip()

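# For example, _clean_text("  Điều 1.\n\n  Phạm vi  ") returns "Điều 1. Phạm vi".
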
def _summarize_section(
    section: LegalSection,
    max_sentences: int = 3,
    max_chars: int = 600
) -> str:
    """
    Produce a concise Vietnamese summary directly from the stored content.

    This is used as the Vietnamese prefill before calling the LLM so we avoid
    English drift and keep the answer grounded.
    """
    content = _clean_text(section.content)
    if not content:
        return ""
    # Split by sentence boundaries; fall back to the whole text if delimiters are missing.
    sentences = re.split(r"(?<=[.!?])\s+", content)
    if not sentences:
        sentences = [content]
    summary_parts = []
    for sentence in sentences:
        if not sentence:
            continue
        summary_parts.append(sentence)
        joined = " ".join(summary_parts)
        if len(summary_parts) >= max_sentences or len(joined) >= max_chars:
            break
    summary = " ".join(summary_parts)
    if len(summary) > max_chars:
        summary = summary[:max_chars].rsplit(" ", 1)[0] + "..."
    return summary.strip()

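# The loop keeps whole sentences until either limit is hit, so the summary can
# overshoot max_chars by one sentence; the final slice-and-rsplit pass trims it
# back to a word boundary and appends "...".
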
def _format_citation(section: LegalSection) -> str:
    citation = section.document.title
    if section.section_code:
        citation = f"{citation} – {section.section_code}"
    page = ""
    if section.page_start:
        page = f" (trang {section.page_start}"
        if section.page_end and section.page_end != section.page_start:
            page += f"-{section.page_end}"
        page += ")"
    return f"{citation}{page}".strip()

def _build_legal_prefill(documents: List[LegalSection]) -> str:
    """
    Build a compact Vietnamese summary block that will be injected into the
    Guardrails prompt. The goal is to bias the model toward Vietnamese output.
    """
    if not documents:
        return ""
    lines = ["Bản tóm tắt tiếng Việt từ cơ sở dữ liệu:"]
    for idx, section in enumerate(documents[:3], start=1):
        summary = _summarize_section(section, max_sentences=2, max_chars=400)
        if not summary:
            continue
        citation = _format_citation(section)
        lines.append(f"{idx}. {summary} (Nguồn: {citation})")
    return "\n".join(lines)

def _generate_legal_citation_block(documents: List[LegalSection]) -> str:
    """Return a formatted citation block reused by multiple answer modes."""
    if not documents:
        return ""
    lines: List[str] = []
    for idx, section in enumerate(documents[:5], start=1):
        summary = _summarize_section(section)
        cleaned = _clean_text(section.content)
        snippet = cleaned[:350]
        # Add an ellipsis only when the content was actually truncated.
        if len(cleaned) > 350:
            snippet = snippet.rsplit(" ", 1)[0] + "..."
        citation = _format_citation(section)
        lines.append(f"{idx}. {section.section_title or 'Nội dung'} – {citation}")
        if summary:
            lines.append(f" - Tóm tắt: {summary}")
        if snippet:
            lines.append(f" - Trích dẫn: \"{snippet}\"")
        lines.append("")
    if len(documents) > 5:
        lines.append(f"... và {len(documents) - 5} trích đoạn khác trong cùng nguồn dữ liệu.")
    return "\n".join(lines).strip()

def _generate_legal_answer(query: str, documents: List[LegalSection]) -> str:
    count = len(documents)
    if count == 0:
        return (
            f"Tôi chưa tìm thấy trích dẫn pháp lý nào cho '{query}'. "
            "Bạn có thể cung cấp thêm ngữ cảnh để tôi tiếp tục hỗ trợ."
        )
    header = (
        f"Tôi đã tổng hợp {count} trích đoạn pháp lý liên quan đến '{query}'. "
        "Đây là bản tóm tắt tiếng Việt kèm trích dẫn:"
    )
    citation_block = _generate_legal_citation_block(documents)
    return f"{header}\n\n{citation_block}".strip()

def _generate_general_answer(query: str, documents: List[Any]) -> str:
    """Generate a general answer."""
    count = len(documents)
    return f"Tôi tìm thấy {count} kết quả liên quan đến '{query}'. Vui lòng xem chi tiết bên dưới."


def _strip_accents(value: str) -> str:
    return "".join(
        char for char in unicodedata.normalize("NFD", value)
        if unicodedata.category(char) != "Mn"
    )


def _contains_markers(
    text_with_accents: str,
    text_without_accents: str,
    markers: List[str]
) -> bool:
    for marker in markers:
        marker_lower = marker.lower()
        marker_no_accents = _strip_accents(marker_lower)
        if marker_lower in text_with_accents or marker_no_accents in text_without_accents:
            return True
    return False

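# Accent stripping makes the marker check diacritic-insensitive; e.g.
# _strip_accents("không tìm thấy") == "khong tim thay", so an answer written
# without diacritics still trips the accented denial marker.
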
def _is_valid_legal_answer(answer: str, documents: List[LegalSection]) -> bool:
    """
    Validate that the LLM answer for legal intent references actual legal content.

    Criteria:
    - Must not contain denial phrases (already handled earlier) or "xin lỗi".
    - Must not introduce obvious monetary values (the legal corpus contains no fine amounts).
    - Must be at least 40 characters long, to reject overly short answers.
    """
    if not answer:
        return False
    normalized_answer = answer.lower()
    normalized_answer_no_accents = _strip_accents(normalized_answer)
    denial_markers = [
        "xin lỗi",
        "thông tin trong cơ sở dữ liệu chưa đủ",
        "không thể giúp",
        "không tìm thấy thông tin",
        "không có dữ liệu",
    ]
    if _contains_markers(normalized_answer, normalized_answer_no_accents, denial_markers):
        return False
    # A bare "đ" marker would match almost any Vietnamese word ("điều", "định", ...),
    # so currency is detected with word-level markers plus a digit-adjacent "đ" pattern.
    money_markers = ["vnđ", "vnd", "đồng", "dong"]
    if _contains_markers(normalized_answer, normalized_answer_no_accents, money_markers):
        return False
    if re.search(r"\d\s*đ\b", normalized_answer):
        return False
    if len(answer.strip()) < 40:
        return False
    return True

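# Illustrative outcomes (hypothetical strings):
#     _is_valid_legal_answer("Xin lỗi, tôi không thể giúp.", docs)      -> False (denial)
#     _is_valid_legal_answer("Mức phạt là 500.000đ theo điều 5.", docs) -> False (money)
#     A 40+ character Vietnamese summary quoting the section passes.
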
def rag_pipeline(
    query: str,
    intent: str,
    top_k: int = 5,
    min_confidence: float = 0.3,
    context: Optional[List[Dict[str, Any]]] = None,
    use_llm: bool = True
) -> Dict[str, Any]:
    """
    Complete RAG pipeline: retrieval + answer generation.

    Args:
        query: User query.
        intent: Detected intent.
        top_k: Number of documents to retrieve.
        min_confidence: Minimum confidence threshold.
        context: Optional conversation context.
        use_llm: Whether to use LLM for answer generation.

    Returns:
        Dictionary with 'answer', 'documents', 'count', 'confidence', 'content_type'.
    """
    # Map intent to content type.
    intent_to_type = {
        'search_procedure': 'procedure',
        'search_fine': 'fine',
        'search_office': 'office',
        'search_advisory': 'advisory',
        'search_legal': 'legal',
        'general_query': 'general',
        'greeting': 'general',
    }
    content_type = intent_to_type.get(intent, 'procedure')

    # Retrieve documents.
    documents = retrieve_top_k_documents(query, content_type, top_k=top_k)

    # Enable the LLM automatically for casual conversation intents.
    llm_allowed = use_llm or intent in {"general_query", "greeting"}
    structured_used = False
    answer: Optional[str] = None

    # For legal intent, try the structured Guardrails path first.
    if intent == "search_legal" and documents:
        llm = get_llm_generator()
        if llm:
            prefill_summary = _build_legal_prefill(documents)
            structured = llm.generate_structured_legal_answer(
                query,
                documents,
                prefill_summary=prefill_summary,
            )
            if structured:
                answer = format_structured_legal_answer(structured)
                structured_used = True
                citation_block = _generate_legal_citation_block(documents)
                if citation_block:
                    answer = (
                        f"{answer.rstrip()}\n\nTrích dẫn chi tiết:\n{citation_block}"
                    )

    if answer is None:
        answer = generate_answer_template(
            query,
            documents,
            content_type,
            context=context,
            use_llm=llm_allowed
        )
    # Fallback when the intent is legal but the LLM answer fails validation.
    if (
        intent == "search_legal"
        and documents
        and isinstance(answer, str)
        and not structured_used
    ):
        if not _is_valid_legal_answer(answer, documents):
            print("[RAG] ⚠️ Fallback: invalid legal answer detected", flush=True)
            answer = _generate_legal_answer(query, documents)
        else:
            # Append only the citation block so the validated LLM answer keeps
            # its own wording instead of a second full templated answer.
            citation_block = _generate_legal_citation_block(documents)
            if citation_block.strip():
                answer = f"{answer.rstrip()}\n\nTrích dẫn chi tiết:\n{citation_block}"
    # Confidence: simple heuristic based on result count, boosted by the top hybrid score.
    confidence = min(1.0, len(documents) / top_k)
    if documents and hasattr(documents[0], '_hybrid_score'):
        confidence = max(confidence, documents[0]._hybrid_score)

    return {
        'answer': answer,
        'documents': documents,
        'count': len(documents),
        'confidence': confidence,
        'content_type': content_type
    }
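
# Illustrative end-to-end call (a sketch; assumes Django settings are configured
# and data has been ingested — the query and intent below are hypothetical):
#
#     result = rag_pipeline("mức phạt nồng độ cồn xe máy", intent="search_fine")
#     print(result["confidence"], result["count"])
#     print(result["answer"])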