| """
|
| Language Detection Module.
|
|
|
| Provides multi-language detection for:
|
| - English (en)
|
| - Hindi (hi)
|
| - Hinglish (code-mixed Hindi and English)
|
|
|
| Uses langdetect library with custom Hinglish detection logic.
|
| Performance target: <100ms per detection.
|
| """
|
|
|
| import time
|
| from typing import Tuple, Optional
|
|
|
| import langdetect
|
| from langdetect import detect_langs, DetectorFactory
|
| from langdetect.lang_detect_exception import LangDetectException
|
|
|
| from app.utils.logger import get_logger
|
|
|
# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)

# Pin langdetect's seed at import time so repeated detections on the same
# text give reproducible results.
DetectorFactory.seed = 0

# Language codes that detect_language() can return.
SUPPORTED_LANGUAGES = {"en", "hi", "hinglish"}

# Fallback language code used when no better answer is available.
DEFAULT_LANGUAGE = "en"
# NOTE(review): not referenced in the visible part of this module - confirm
# whether external callers rely on it.
DEFAULT_CONFIDENCE = 0.3
# Confidence reported with DEFAULT_LANGUAGE for empty input and on
# unexpected errors.
ERROR_CONFIDENCE = 0.3

# Minimum share of the alphabetic characters that BOTH Devanagari and Latin
# must reach before mixed-script text is classified as Hinglish.
HINGLISH_MIN_RATIO = 0.1
|
|
|
|
|
class LanguageDetector:
    """
    Object-style facade over this module's detection helpers.

    Recognises English ('en'), Hindi ('hi') and code-mixed Hinglish
    ('hinglish'). Detection is delegated to the module-level functions; the
    instance only records whether its set-up succeeded.

    NOTE(review): the instance keeps no per-call state, but overall
    thread-safety depends on langdetect itself - confirm before relying on it.

    Attributes:
        _initialized: True once the langdetect seed was set without error
    """

    def __init__(self) -> None:
        """
        Prepare the detector.

        Pins the langdetect seed for reproducible results and records the
        outcome in ``_initialized``. Set-up failures are logged, not raised.
        """
        self._initialized = False
        try:
            DetectorFactory.seed = 0
            logger.debug("LanguageDetector initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize LanguageDetector: {e}")
        else:
            # Only reached when both the seed assignment and the debug log
            # completed without raising.
            self._initialized = True

    def detect(self, text: str) -> Tuple[str, float]:
        """
        Detect the language of ``text``.

        Thin wrapper around the module-level ``detect_language``.

        Args:
            text: Input text to analyze

        Returns:
            Tuple of (language_code, confidence)
            language_code: 'en', 'hi', or 'hinglish'
            confidence: 0.0-1.0
        """
        outcome = detect_language(text)
        return outcome

    def is_hinglish(self, text: str) -> bool:
        """
        Report whether ``text`` mixes Devanagari and Latin script.

        This is a pure presence check: one character of each script is
        enough, no minimum ratio is applied.

        Args:
            text: Input text

        Returns:
            True if text contains both Devanagari and Latin characters
        """
        if not has_devanagari(text):
            return False
        return has_latin(text)

    def get_script_ratios(self, text: str) -> dict:
        """
        Compute the share of each script among ALL characters of ``text``.

        The denominator is ``len(text)``, so whitespace, digits and
        punctuation are counted under "other".

        Args:
            text: Input text

        Returns:
            Dictionary with "devanagari", "latin" and "other" ratios
            (all 0.0 for empty input)
        """
        if not text:
            return {"devanagari": 0.0, "latin": 0.0, "other": 0.0}

        length = len(text)
        n_devanagari = 0
        n_latin = 0
        for ch in text:
            if is_devanagari_char(ch):
                n_devanagari += 1
            if is_latin_char(ch):
                n_latin += 1
        n_other = length - n_devanagari - n_latin

        return {
            "devanagari": n_devanagari / length,
            "latin": n_latin / length,
            "other": n_other / length,
        }
|
|
|
|
|
def detect_language(text: str) -> Tuple[str, float]:
    """
    Detect language of text.

    Detection priority:
        1. Mixed Devanagari + Latin script where each script reaches
           HINGLISH_MIN_RATIO of the alphabetic characters -> 'hinglish'
        2. langdetect's top candidate when it is 'en' or 'hi'
        3. Character-based fallback when langdetect raises, returns nothing,
           or reports any other language
        4. DEFAULT_LANGUAGE with ERROR_CONFIDENCE for empty or non-string
           input and on unexpected errors

    Args:
        text: Input message

    Returns:
        Tuple of (language_code, confidence)
        language_code: 'en', 'hi', or 'hinglish'
        confidence: 0.0-1.0

    Raises:
        Nothing is raised for bad input or detection failures; the default
        fallback is returned instead.
    """
    start_time = time.time()

    # Guard first: calling .strip() on a non-string would raise before the
    # try block below and break the never-raises contract.
    if text is not None and not isinstance(text, str):
        logger.warning(
            f"Language detection error: expected str, got {type(text).__name__}"
        )
        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)

    if not text or not text.strip():
        logger.debug("Empty text provided, returning default")
        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)

    text = text.strip()

    # Plain string scans that cannot fail for str input; computed once so the
    # main path and the LangDetectException handler share the same values.
    has_dev = has_devanagari(text)
    has_lat = has_latin(text)

    try:
        if has_dev and has_lat:
            ratios = _get_script_ratios(text)
            # Both scripts must clear the threshold, i.e. the smaller share must.
            minority_share = min(ratios["devanagari"], ratios["latin"])
            if minority_share >= HINGLISH_MIN_RATIO:
                # More balanced mixes score higher, capped at 0.95.
                confidence = min(0.95, 0.7 + (minority_share * 2))
                _log_detection("hinglish", confidence, start_time)
                return ("hinglish", confidence)

        detected_langs = detect_langs(text)

        if detected_langs:
            top_detection = detected_langs[0]
            lang_code = top_detection.lang
            confidence = top_detection.prob

            if lang_code in ("en", "hi"):
                _log_detection(lang_code, confidence, start_time)
                return (lang_code, confidence)

        # No candidates, or an unsupported language: decide from the scripts
        # actually present in the text.
        return _character_based_detection(text, has_dev, has_lat, start_time)

    except LangDetectException as e:
        logger.debug(f"LangDetect exception: {e}")
        return _character_based_detection(text, has_dev, has_lat, start_time)

    except Exception as e:
        # Last-resort boundary: callers rely on always getting a result.
        logger.warning(f"Language detection error: {e}")
        _log_detection(DEFAULT_LANGUAGE, ERROR_CONFIDENCE, start_time)
        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)
|
|
|
|
|
def _character_based_detection(
    text: str,
    has_dev: bool,
    has_lat: bool,
    start_time: float
) -> Tuple[str, float]:
    """
    Decide the language purely from which scripts are present.

    Used when langdetect cannot give a usable answer. The confidences are
    fixed per outcome: both scripts -> 'hinglish' 0.7, Devanagari only ->
    'hi' 0.85, Latin only -> 'en' 0.75, neither -> DEFAULT_LANGUAGE 0.5.

    Args:
        text: Input text (not inspected here; the flags carry the analysis)
        has_dev: Whether text contains Devanagari
        has_lat: Whether text contains Latin
        start_time: Detection start time for logging

    Returns:
        Tuple of (language_code, confidence)
    """
    if has_dev:
        verdict = ("hinglish", 0.7) if has_lat else ("hi", 0.85)
    elif has_lat:
        verdict = ("en", 0.75)
    else:
        # Neither script found (e.g. digits, punctuation or another script).
        verdict = (DEFAULT_LANGUAGE, 0.5)

    language, score = verdict
    _log_detection(language, score, start_time)
    return verdict
|
|
|
|
|
| def _get_script_ratios(text: str) -> dict:
|
| """
|
| Calculate the ratio of different scripts in text.
|
|
|
| Args:
|
| text: Input text
|
|
|
| Returns:
|
| Dictionary with ratios for each script type
|
| """
|
| if not text:
|
| return {"devanagari": 0.0, "latin": 0.0, "other": 0.0}
|
|
|
|
|
| alpha_chars = [char for char in text if char.isalpha()]
|
|
|
| if not alpha_chars:
|
| return {"devanagari": 0.0, "latin": 0.0, "other": 0.0}
|
|
|
| total_alpha = len(alpha_chars)
|
| devanagari_count = sum(1 for char in alpha_chars if is_devanagari_char(char))
|
| latin_count = sum(1 for char in alpha_chars if is_latin_char(char))
|
| other_count = total_alpha - devanagari_count - latin_count
|
|
|
| return {
|
| "devanagari": devanagari_count / total_alpha,
|
| "latin": latin_count / total_alpha,
|
| "other": other_count / total_alpha,
|
| }
|
|
|
|
|
def _log_detection(lang: str, confidence: float, start_time: float) -> None:
    """
    Emit a debug log line for a finished detection.

    Args:
        lang: Detected language code
        confidence: Confidence reported for the detection
        start_time: ``time.time()`` value captured when detection began
    """
    # Wall-clock seconds since start, converted to milliseconds.
    elapsed_ms = 1000 * (time.time() - start_time)
    message = f"Detected language: {lang}, confidence: {confidence:.2f}, time: {elapsed_ms:.2f}ms"
    logger.debug(message)
|
|
|
|
|
def has_devanagari(text: str) -> bool:
    """
    Report whether text contains at least one Devanagari character.

    A character counts as Devanagari when ``is_devanagari_char`` accepts it
    (Unicode block U+0900 to U+097F).

    Args:
        text: Input text

    Returns:
        True if any character is Devanagari; False for empty input
    """
    if not text:
        return False
    for symbol in text:
        if is_devanagari_char(symbol):
            return True
    return False
|
|
|
|
|
def has_latin(text: str) -> bool:
    """
    Report whether text contains at least one Latin character.

    A character counts as Latin when ``is_latin_char`` accepts it.

    Args:
        text: Input text

    Returns:
        True if any character is Latin; False for empty input
    """
    if not text:
        return False
    for symbol in text:
        if is_latin_char(symbol):
            return True
    return False
|
|
|
|
|
def is_devanagari_char(char: str) -> bool:
    """
    Tell whether a character lies in the Devanagari Unicode block.

    The block spans U+0900 to U+097F, both ends included.

    Args:
        char: Single character

    Returns:
        True if character is in Devanagari Unicode range
    """
    if char < "\u0900":
        return False
    return char <= "\u097F"
|
|
|
|
|
def is_latin_char(char: str) -> bool:
    """
    Check if a single character is an ASCII Latin letter.

    The character is compared against both ASCII ranges directly instead of
    being lower-cased first: ``str.lower()`` maps some non-ASCII characters
    into the ASCII range (U+212A KELVIN SIGN becomes "k", U+0130 becomes "i"
    plus a combining dot), which would wrongly report them as ASCII letters.

    Args:
        char: Single character

    Returns:
        True if character is an ASCII letter (a-z or A-Z)
    """
    return "a" <= char <= "z" or "A" <= char <= "Z"
|
|
|
|
|
def get_language_name(code: str) -> str:
    """
    Translate a language code into its display name.

    Args:
        code: Language code ('en', 'hi', 'hinglish')

    Returns:
        Human-readable language name, or "Unknown" for any other code
    """
    labels = dict(
        en="English",
        hi="Hindi",
        hinglish="Hinglish (Code-Mixed)",
    )
    try:
        return labels[code]
    except KeyError:
        return "Unknown"
|
|
|