Spaces:
Sleeping
Sleeping
| """ | |
| Query expansion and paraphrasing utilities for improving search recall. | |
| """ | |
| import re | |
| import unicodedata | |
| from typing import List, Dict, Any, Optional, Set | |
| from hue_portal.core.models import Synonym | |
| from hue_portal.core.search_ml import expand_query_with_synonyms | |
def normalize_vietnamese_query(query: str) -> str:
    """
    Normalize a Vietnamese query so equivalent spellings compare equal.

    The query is trimmed, internal whitespace runs are collapsed to a single
    space, the text is lowercased, and the result is converted to Unicode
    NFC form. The NFC step matters for Vietnamese: the same visible letter
    can be encoded either as one precomposed code point or as a base letter
    followed by combining marks, and without normalization those two
    encodings never match each other.

    Args:
        query: Input query string.

    Returns:
        Normalized query string, or "" when the input is empty or None.
    """
    if not query:
        return ""

    # Trim and collapse every whitespace run into a single space.
    collapsed = re.sub(r'\s+', ' ', query.strip())

    # Lowercase before composing so the final string is guaranteed to be NFC.
    lowered = collapsed.lower()

    # Unify precomposed and decomposed diacritics.
    return unicodedata.normalize("NFC", lowered)
def extract_key_phrases(query: str) -> List[str]:
    """
    Extract key words and two-word phrases from a query.

    The query is NFC-normalized, lowercased and split into word tokens.
    Multi-word stop phrases (e.g. "bao nhiêu") are removed as whole token
    sequences; single-word stopwords and words of two characters or fewer
    are dropped. Bigrams are then built from consecutive surviving words.

    Args:
        query: Input query string.

    Returns:
        List containing the surviving single words followed by the bigrams
        formed from them. Empty list when the query is empty.
    """
    if not query:
        return []

    # Single-token stopwords, checked per word.
    stopwords = {
        "là", "gì", "của", "và", "hoặc",
        "tôi", "bạn", "có", "không", "được", "một", "các", "với", "cho",
    }
    # Multi-token stop phrases must be matched against token sequences:
    # a per-word membership test can never equal a string containing spaces.
    # Longest phrase first so the longest match wins.
    stop_phrases = (
        ("như", "thế", "nào"),
        ("bao", "nhiêu"),
        ("ở", "đâu"),
    )

    # Compose diacritics first: \w does not match combining marks, so
    # decomposed input would otherwise be split in the middle of words.
    text = unicodedata.normalize("NFC", query).lower()
    words = re.findall(r'\b\w+\b', text)

    key_words: List[str] = []
    index = 0
    total = len(words)
    while index < total:
        skipped = 0
        for phrase in stop_phrases:
            if tuple(words[index:index + len(phrase)]) == phrase:
                skipped = len(phrase)
                break
        if skipped:
            index += skipped
            continue

        word = words[index]
        if word not in stopwords and len(word) > 2:
            key_words.append(word)
        index += 1

    # Bigrams are formed from neighbours in the filtered list.
    phrases = [f"{first} {second}" for first, second in zip(key_words, key_words[1:])]

    return key_words + phrases
def expand_query_semantically(query: str, context: Optional[Dict[str, Any]] = None) -> List[str]:
    """
    Build a de-duplicated list of alternative phrasings for a query.

    Candidates are gathered in this order: the original query, the output of
    ``expand_query_with_synonyms``, the query suffixed with context entities
    (``fine_code`` then ``procedure_name``, when present), and the results of
    ``_get_vietnamese_variations``. Candidates whose normalized form was
    already seen are dropped; the first spelling encountered is kept.

    Args:
        query: Original query string.
        context: Optional context dictionary; only its "entities" mapping
            is consulted here.

    Returns:
        List of expanded query variations, original query first.
    """
    candidates = [query]

    # Synonym expansion provided by the search_ml module.
    candidates.extend(expand_query_with_synonyms(query))

    # Context-driven expansions: append each known entity to the query.
    if context:
        entities = context.get("entities", {})
        for entity_key in ("fine_code", "procedure_name"):
            if entity_key in entities:
                candidates.append(f"{query} {entities[entity_key]}")

    # Hand-written Vietnamese phrase substitutions.
    candidates.extend(_get_vietnamese_variations(query))

    # Order-preserving de-duplication keyed on the normalized form.
    first_seen: Dict[str, str] = {}
    for candidate in candidates:
        first_seen.setdefault(normalize_vietnamese_query(candidate), candidate)

    return list(first_seen.values())
def _get_vietnamese_variations(query: str) -> List[str]:
    """
    Produce rewrites of a query by swapping known phrases for alternatives.

    The query is lowercased and checked against a fixed table of phrases.
    For every table phrase found as a substring, one rewrite is produced per
    alternative, replacing all occurrences of the phrase.

    Args:
        query: Input query.

    Returns:
        List of lowercased variations, in table order. Empty when no table
        phrase occurs in the query.
    """
    lowered = query.lower()

    # (phrase, alternatives) pairs; order determines the output order.
    substitution_table = (
        ("mức phạt", ("tiền phạt", "phạt", "xử phạt")),
        ("thủ tục", ("hồ sơ", "giấy tờ", "quy trình")),
        ("địa chỉ", ("nơi", "chỗ", "điểm")),
        ("số điện thoại", ("điện thoại", "số liên hệ", "hotline")),
        ("giờ làm việc", ("thời gian", "giờ", "lịch làm việc")),
        ("cảnh báo", ("thông báo", "lưu ý", "chú ý")),
        ("lừa đảo", ("scam", "gian lận", "lừa")),
    )

    rewrites: List[str] = []
    for phrase, alternatives in substitution_table:
        if phrase not in lowered:
            continue
        for alternative in alternatives:
            candidate = lowered.replace(phrase, alternative)
            # Skip substitutions that leave the text unchanged.
            if candidate == lowered:
                continue
            rewrites.append(candidate)

    return rewrites
def paraphrase_query(query: str) -> List[str]:
    """
    Generate alternative wordings of a query to widen search recall.

    The original query is always the first entry. Further entries are
    derived from the lowercased query: first by regex rewrite rules for
    common Vietnamese question shapes, then by replacing the question
    phrases "bao nhiêu" and "như thế nào" with fixed alternatives.
    Exact duplicates are removed while keeping first-seen order.

    Args:
        query: Original query string.

    Returns:
        List of paraphrased queries, starting with the original query.
    """
    lowered = query.lower()
    results = [query]

    # (pattern, replacement template) rewrite rules for question shapes.
    rewrite_rules = (
        (r"mức phạt (.+) là bao nhiêu", r"phạt \1 bao nhiêu tiền"),
        (r"thủ tục (.+) cần gì", r"làm thủ tục \1 cần giấy tờ gì"),
        (r"địa chỉ (.+) ở đâu", r"\1 ở đâu"),
        (r"(.+) như thế nào", r"cách \1"),
    )
    for pattern, template in rewrite_rules:
        rewritten, hits = re.subn(pattern, template, lowered)
        # Keep the rewrite only if the rule matched and changed the text.
        if hits and rewritten != lowered:
            results.append(rewritten)

    # Plain substring swaps for question words.
    question_swaps = (
        ("bao nhiêu", ("mức", "giá")),
        ("như thế nào", ("cách", "quy trình")),
    )
    for phrase, alternatives in question_swaps:
        if phrase in lowered:
            results.extend(lowered.replace(phrase, alt) for alt in alternatives)

    # Drop exact duplicates, preserving the order of first appearance.
    unique: List[str] = []
    seen = set()
    for item in results:
        if item in seen:
            continue
        seen.add(item)
        unique.append(item)
    return unique
def enhance_query_with_context(query: str, context: Optional[Dict[str, Any]] = None) -> str:
    """
    Append context entities and intent keywords to a query.

    Entity values for ``fine_code``, ``procedure_name`` and ``office_name``
    (in that order) are appended when present. Values are converted with
    ``str()`` so non-string values such as numeric codes do not break the
    final join; ``None`` and empty values are skipped. A fixed keyword
    phrase is appended for the intents ``search_fine``, ``search_procedure``
    and ``search_office``.

    Args:
        query: Original query string.
        context: Optional context dictionary with optional "entities"
            (mapping) and "intent" (string) entries.

    Returns:
        The space-joined enhanced query, or ``query`` unchanged when no
        context is given.
    """
    if not context:
        return query

    enhanced_parts = [query]

    # A context may carry an explicit null for "entities"; treat it as empty.
    entities = context.get("entities") or {}
    for entity_key in ("fine_code", "procedure_name", "office_name"):
        if entity_key not in entities:
            continue
        value = entities[entity_key]
        if value is None:
            continue
        # str.join only accepts strings, so coerce before collecting.
        text = str(value)
        if text:
            enhanced_parts.append(text)

    # Intent-specific keyword phrases.
    intent_keywords = {
        "search_fine": "mức phạt vi phạm",
        "search_procedure": "thủ tục hành chính",
        "search_office": "đơn vị công an",
    }
    intent = context.get("intent", "")
    # Only strings can name an intent; the guard also avoids hashing
    # unhashable values during the lookup.
    if isinstance(intent, str) and intent in intent_keywords:
        enhanced_parts.append(intent_keywords[intent])

    return " ".join(enhanced_parts)