Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Enhanced Translation Service for Multi-Language Support | |
| Handles translation of queries and responses for Sinhala, Tamil, Singlish, and English | |
| """ | |
| import requests | |
| import json | |
| import re | |
| import openai | |
| import google.generativeai as genai | |
| from typing import Dict, Any, Optional | |
| from config import Config | |
| from logger import get_logger | |
| from language_detector import LanguageDetector | |
| class TranslationService: | |
def __init__(self):
    """Load configuration, feature flags, endpoints and the term dictionaries.

    Reads optional API credentials from ``Config`` (missing attributes
    become ``None``), reads two boolean switches from the environment, and
    builds the Tamil and Sinhala transport vocabularies used by the
    pattern parser and the dictionary fallback.
    """
    import os

    self.config = Config()
    self.openai_api_key = getattr(self.config, 'OPENAI_API_KEY', None)
    self.google_api_key = getattr(self.config, 'GOOGLE_API_KEY', None)
    self.google_model = getattr(self.config, 'GOOGLE_MODEL', 'gemini-1.5-flash')
    self.logger = get_logger(self.__class__.__name__)
    self.language_detector = LanguageDetector()

    # Controls: both default to off and are enabled with the literal "true"
    # (case-insensitive) in the environment.
    self.use_pattern_translation = os.getenv('USE_PATTERN_TRANSLATION', 'false').lower() == 'true'
    self.force_llm_translation = os.getenv('FORCE_LLM_TRANSLATION', 'false').lower() == 'true'
    # Name of the backend that produced the most recent translation.
    self.last_translation_method: Optional[str] = None

    # Free translation APIs
    self.libre_translate_url = "https://libretranslate.de/translate"  # Free public instance
    self.mymemory_url = "https://api.mymemory.translated.net/get"

    # Tamil transport terms and their English equivalents
    self.tamil_transport_terms = {
        # Fare related
        'எவ்வளவு': 'how much',
        'விலை': 'price',
        'கட்டணம்': 'fare',
        'செலவு': 'cost',
        'பேருந்து கட்டணம்': 'bus fare',
        'ரயில் கட்டணம்': 'train fare',
        'டிக்கெட் விலை': 'ticket price',
        # Locations
        'கொழும்பு': 'Colombo',
        'கண்டி': 'Kandy',
        'காலி': 'Galle',
        'மாத்தறை': 'Matara',
        'அனுராதபுரம்': 'Anuradhapura',
        'பனதுரை': 'Panadura',
        'அலுத்துகமா': 'Aluthgama',
        'நுகேகோடா': 'Nugegoda',
        'தெஹிவாலா': 'Dehiwala',
        'மொரட்டுவா': 'Moratuwa',
        # Direction words
        'இருந்து': 'from',
        'வரை': 'to',
        'வழியாக': 'via',
        'மூலம்': 'through',
        # Question words
        'எங்கே': 'where',
        'எப்போது': 'when',
        'எப்படி': 'how',
        'என்ன': 'what',
        'யார்': 'who',
        # Comparison words
        'உடன்': 'with',
        'மற்றும்': 'and',
        'அல்லது': 'or',
        'அதிகம்': 'more',
        'குறைவு': 'less',
        'ஒரே': 'same',
        'வேறு': 'different',
        'ஒப்பிடு': 'compare',
        'வித்தியாசம்': 'difference',
        # Time words
        'இப்போது': 'now',
        'இன்று': 'today',
        'நாளை': 'tomorrow',
        'நேற்று': 'yesterday',
        # Common verbs
        'போ': 'go',
        'வா': 'come',
        'பார்': 'see',
        'தெரிந்து கொள்': 'know',
        'கண்டுபிடி': 'find',
        'கற்றுக்கொள்': 'learn',
        'பரிந்துரை': 'recommend',
        'காட்டு': 'show',
        # Numbers and currency
        'ரூபாய்': 'rupees',
        'ரூ': 'rupees',
        # Common phrases ('உடன்' is already listed under comparison words)
        'இடையில்': 'between',
        'பாதைகள்': 'routes',
        'பிரபலமான': 'popular',
        'சராசரி': 'average',
        'தரவு': 'data',
        'புள்ளிவிவரங்கள்': 'statistics',
    }

    # Common transport terms in Sinhala and their English equivalents
    self.transport_terms = {
        # Fare related
        'කීයද': 'how much',
        'මිල': 'price',
        'වාරික': 'fare',
        'වාරිකය': 'fare',
        'වාරිකව': 'fare',
        'ගාස්තු': 'fare',
        'ගාස්තුව': 'fare',
        'ප්රවාහන ගාස්තු': 'transport fare',
        'බස් ගාස්තු': 'bus fare',
        'බස් ගාස්තුව': 'bus fare',
        'රේල් ගාස්තු': 'train fare',
        'රේල් ගාස්තුව': 'train fare',
        # Locations (bare form plus the form carrying the "to" suffix)
        'කොළඹ': 'Colombo',
        'මහනුවර': 'Kandy',
        'මහනුවරට': 'Kandy',
        'ගාල්ල': 'Galle',
        'ගාල්ලට': 'Galle',
        'මාතර': 'Matara',
        'මාතරට': 'Matara',
        'අනුරාධපුර': 'Anuradhapura',
        'අනුරාධපුරට': 'Anuradhapura',
        'පානදුර': 'Panadura',
        'පානදුරට': 'Panadura',
        'අලුත්ගම': 'Aluthgama',
        'අලුත්ගමට': 'Aluthgama',
        'නුගේගොඩ': 'Nugegoda',
        'නුගේගොඩට': 'Nugegoda',
        'දෙහිවල': 'Dehiwala',
        'දෙහිවලට': 'Dehiwala',
        'මොරටුව': 'Moratuwa',
        'මොරටුවට': 'Moratuwa',
        # Direction words
        'වලින්': 'from',
        'වල': 'from',
        'ට': 'to',
        'වෙත': 'to',
        'සිට': 'from',
        'දක්වා': 'to',
        'සි': 'from',
        # Question words
        'කොහෙද': 'where',
        'කවදාද': 'when',
        'කොහොමද': 'how',
        'මොනවාද': 'what',
        'කවුද': 'who',
        # Comparison words
        'සමඟ': 'with',
        'සහ': 'and',
        'හෝ': 'or',
        'වඩා': 'more',
        'අඩු': 'less',
        'සමාන': 'same',
        'වෙනස': 'different',
        'සසඳන්න': 'compare',
        'සසඳන': 'compare',
        # Time words
        'දැන්': 'now',
        'අද': 'today',
        'හෙට': 'tomorrow',
        'ඊයේ': 'yesterday',
        # Common verbs
        'යන්න': 'go',
        'යන': 'go',
        'එන්න': 'come',
        'බලන්න': 'see',
        'දැනගන්න': 'know',
        'සොයන්න': 'find',
        'සොයන': 'find',
        'ඉගෙනගන්න': 'learn',
        'නිර්දේශ': 'recommend',
        'නිර්දේශ කරන්න': 'recommend',
        'පෙන්වන්න': 'show',
        'පෙන්වන': 'show',
        # Numbers and currency
        'රුපියල්': 'rupees',
        'රු': 'rupees',
        'රුපියල': 'rupees',
        # Common phrases
        'අතර': 'between',
        'සහිත': 'with',
        'මාර්ග': 'routes',
        'මාර්ගවල': 'routes',
        'ගමනාන්ත': 'destinations',
        'ප්රසිද්ධ': 'popular',
        'සාමාන්ය': 'average',
        'සාමාන්යය': 'average',
        'දත්ත': 'data',
        'සංඛ්යාලේඛන': 'statistics',
    }

    # Sinhala script detection pattern (Unicode block U+0D80-U+0DFF)
    self.sinhala_pattern = re.compile(r'[\u0D80-\u0DFF]')
def is_sinhala_text(self, text: str) -> bool:
    """Report whether ``self.sinhala_pattern`` matches anywhere in *text*.

    The outcome is also written to the debug log.
    """
    has_match = self.sinhala_pattern.search(text) is not None
    self.logger.debug(f"Sinhala detection: detected={has_match}, text='{text}'")
    return has_match
def is_tamil_text(self, text: str) -> bool:
    """Report whether *text* holds any character in the range U+0B80-U+0BFF.

    The outcome is also written to the debug log.
    """
    has_match = re.search(r'[\u0B80-\u0BFF]', text) is not None
    self.logger.debug(f"Tamil detection: detected={has_match}, text='{text}'")
    return has_match
def is_singlish_text(self, text: str) -> bool:
    """Return True when the language detector labels *text* as 'singlish'."""
    label = self.language_detector.detect_language(text)['language']
    return label == 'singlish'
def _map_sinhala_place(self, text: str) -> str:
    """Look up the English name for a Sinhala place token.

    The trimmed token is tried first; if that misses, a trailing case
    particle is removed and the lookup is repeated. When neither form is
    in ``self.transport_terms`` the trimmed token is returned unchanged.
    """
    token = text.strip()
    without_suffix = re.sub(r'(ට|වෙත|දක්වා|වලින්|වල|සිට)$', '', token)
    for probe in (token, without_suffix):
        if probe in self.transport_terms:
            return self.transport_terms[probe]
    return token
def _map_tamil_place(self, text: str) -> str:
    """Look up the English name for a Tamil place token.

    The trimmed token is tried first; if that misses, a trailing
    postposition is removed and the lookup is repeated. When neither form
    is in ``self.tamil_transport_terms`` the trimmed token is returned
    unchanged.
    """
    token = text.strip()
    without_suffix = re.sub(r'(இருந்து|வரை|வழியாக|மூலம்)$', '', token)
    for probe in (token, without_suffix):
        if probe in self.tamil_transport_terms:
            return self.tamil_transport_terms[probe]
    return token
def _parse_sinhala_fare_query(self, query: str) -> Optional[str]:
    """Turn a simple single-route Sinhala fare question into English.

    Example handled: "කොළඹ සිට මහනුවරට ගාස්තුව කීයද?" -> "What is the fare from Colombo to Kandy?"

    Args:
        query: The raw user query.

    Returns:
        The canonical English question, or ``None`` when the query is not a
        string, has no fare keyword, mentions more than one route, or does
        not match the "<source> සිට <destination><to-particle>" shape.
    """
    if not isinstance(query, str) or not query:
        return None

    # Quick check for fare-related tokens to avoid false positives
    fare_tokens = ('ගාස්තු', 'ගාස්තුව', 'වාරික', 'වාරිකය', 'මිල')
    if not any(tok in query for tok in fare_tokens):
        return None

    # Two or more "from" particles means a multi-route (e.g. comparison)
    # request; reducing it to one route would silently drop the rest.
    if query.count('සිට') != 1:
        return None

    # The "to" particle must end its word. Without the lookahead the lazy
    # destination group stops at the first 'ට' it meets, which truncates
    # names that contain that letter (මොරටුවට would yield "මොර").
    m = re.search(
        r'([\u0D80-\u0DFF\s]+?)\s*සිට\s*([\u0D80-\u0DFF\s]+?)(?:ට|වෙත|දක්වා)(?![\u0D80-\u0DFF])',
        query,
    )
    if not m:
        return None

    src_en = self._map_sinhala_place(m.group(1).strip())
    dst_en = self._map_sinhala_place(m.group(2).strip())
    return f"What is the fare from {src_en} to {dst_en}?"
def _parse_tamil_fare_query(self, query: str) -> Optional[str]:
    """Turn a simple single-route Tamil fare question into English.

    Example handled: "கொழும்பு இருந்து கண்டி வரை கட்டணம் எவ்வளவு?" -> "What is the fare from Colombo to Kandy?"

    Args:
        query: The raw user query.

    Returns:
        The canonical English question, or ``None`` when the query is not a
        string, has no fare keyword, mentions more than one route, or does
        not match the "<source> இருந்து <destination> வரை/வழியாக" shape.
    """
    if not isinstance(query, str) or not query:
        return None

    # Quick check for fare-related tokens to avoid false positives
    fare_tokens = ('கட்டணம்', 'விலை', 'செலவு', 'எவ்வளவு')
    if not any(tok in query for tok in fare_tokens):
        return None

    # Two or more "from" particles means a multi-route (e.g. comparison)
    # request; reducing it to one route would silently drop the rest.
    if query.count('இருந்து') != 1:
        return None

    # Extract source and destination around Tamil "from" and "to" particles
    m = re.search(
        r'([\u0B80-\u0BFF\s]+?)\s*இருந்து\s*([\u0B80-\u0BFF\s]+?)(?:வரை|வழியாக)',
        query,
    )
    if not m:
        return None

    src_en = self._map_tamil_place(m.group(1).strip())
    dst_en = self._map_tamil_place(m.group(2).strip())
    return f"What is the fare from {src_en} to {dst_en}?"
def translate_with_llm(self, text: str, target_lang: str, source_lang: str = 'auto') -> Optional[str]:
    """Translate *text* with Google Gemini, falling back to OpenAI.

    Args:
        text: Text to translate.
        target_lang: Target language code ('en', 'si' or 'ta').
        source_lang: Source language code ('si', 'ta', 'singlish', 'en'),
            or 'auto' to ask ``self.language_detector``.

    Returns:
        The translated text with one pair of surrounding double quotes
        removed, or ``None`` when no API key is configured, the language
        pair is unsupported, or every backend failed or answered with
        nothing. Backend errors are logged as warnings, never raised.

    Side effects:
        Sets ``self.last_translation_method`` to 'gemini' or 'llm' when the
        corresponding backend produced a non-empty answer.
    """
    if not (self.google_api_key or self.openai_api_key):
        return None
    try:
        # Determine source language
        if source_lang == 'auto':
            detected_lang = self.language_detector.detect_language(text)['language']
            source_lang = {
                'sinhala': 'si',
                'tamil': 'ta',
                'singlish': 'singlish',
            }.get(detected_lang, 'en')

        # Supported (source, target) pairs and their human-readable direction
        lang_map = {
            ('si', 'en'): 'Sinhala to English',
            ('en', 'si'): 'English to Sinhala',
            ('ta', 'en'): 'Tamil to English',
            ('en', 'ta'): 'English to Tamil',
            ('singlish', 'en'): 'Singlish to English',
        }
        direction = lang_map.get((source_lang, target_lang))
        if not direction:
            return None
        target_name = direction.rsplit(' to ', 1)[1]

        # Few-shot pairs keyed by source language; every pair targets English,
        # so directions that start from English get no examples.
        few_shot = {
            'si': [
                (
                    "කොළඹ සිට මහනුවරට යන බස් ගාස්තුව කීයද?",
                    "What is the bus fare from Colombo to Kandy?"
                ),
                (
                    "කොළඹ සිට ගාල්ල දක්වා ටිකට් මිල කීයද?",
                    "What is the ticket price from Colombo to Galle?"
                ),
                (
                    "කොළඹ සිට පානදුර දක්වා සහ කොළඹ සිට ගාල්ල දක්වා ගාස්තු සසඳා බලන්න.",
                    "Compare fares from Colombo to Panadura and from Colombo to Galle."
                ),
                (
                    "රුපියල් 500 ට අඩු ගාස්තු සහිත මාර්ග පෙන්වන්න.",
                    "Show routes with fares under 500 rupees."
                ),
                (
                    "අඩු මිලේ මාර්ග නිර්දේශ කරන්න.",
                    "Recommend cheap routes."
                ),
            ],
            'ta': [
                (
                    "கொழும்பு இருந்து கண்டி வரை பேருந்து கட்டணம் எவ்வளவு?",
                    "What is the bus fare from Colombo to Kandy?"
                ),
                (
                    "கொழும்பு இருந்து காலி வரை டிக்கெட் விலை எவ்வளவு?",
                    "What is the ticket price from Colombo to Galle?"
                ),
                (
                    "கொழும்பு இருந்து பனதுரை வரை மற்றும் கொழும்பு இருந்து காலி வரை கட்டணம் ஒப்பிடு.",
                    "Compare fares from Colombo to Panadura and from Colombo to Galle."
                ),
                (
                    "ரூபாய் 500 க்கு குறைவான கட்டணம் உள்ள பாதைகளை காட்டு.",
                    "Show routes with fares under 500 rupees."
                ),
                (
                    "குறைந்த விலையில் பாதைகளை பரிந்துரை.",
                    "Recommend cheap routes."
                ),
            ],
            'singlish': [
                (
                    "කොළඹ සිට Kandy ගාස්තුව කීයද?",
                    "What is the fare from Colombo to Kandy?"
                ),
                (
                    "Colombo සිට ගාල්ලට bus fare කීයද?",
                    "What is the bus fare from Colombo to Galle?"
                ),
                (
                    "කොළඹ සිට Panadura සහ Colombo සිට Galle fares compare කරන්න.",
                    "Compare fares from Colombo to Panadura and from Colombo to Galle."
                ),
            ],
        }
        # Only the first two pairs are sent, to keep the prompt focused.
        examples = few_shot.get(source_lang, [])[:2]

        # The direction and target language are stated explicitly: without
        # them an English -> Sinhala/Tamil request carried no target at all.
        base_instructions = (
            f"You are a professional translator. Translate the text from {direction}. "
            "Translate ONLY the specific text provided. "
            "Do not include examples or additional text. "
            f"Return only the {target_name} translation without quotes. "
        )
        if target_lang == 'en':
            # Canonical English phrasing only makes sense for English output.
            gemini_rules = (
                "Use exact place names: මහනුවර=Kandy, කොළඹ=Colombo, ගාල්ල=Galle, මාතර=Matara, අනුරාධපුර=Anuradhapura. "
                "Use 'from' for 'සිට' and 'to' for 'ට/වෙත/දක්වා'. "
                "Use 'How much is the' for fare questions. Use 'Compare' for comparison requests."
            )
            chat_rules = (
                "Canonical phrasing rules (use exactly): \n"
                "- Use 'Compare' for comparison requests.\n"
                "- Use 'Show' for requests like 'පෙන්වන්න' (do not use Provide/List).\n"
                "- Use 'How much is the' for 'කීයද' fare/price questions.\n"
                "- Use 'cheap' (not 'affordable').\n"
                "- Use 'under' (not 'below') for '< value'.\n"
                "- Use exact place names: මහනුවර=Kandy, කොළඹ=Colombo, ගාල්ල=Galle, මාතර=Matara, අනුරාධපුර=Anuradhapura.\n"
                "- Use 'from' for 'සිට' and 'to' for 'ට/වෙත/දක්වා'.\n"
            )
        else:
            gemini_rules = chat_rules = ""

        # Try Google Gemini first
        translated = None
        if self.google_api_key:
            try:
                genai.configure(api_key=self.google_api_key)
                model = genai.GenerativeModel(self.google_model)
                examples_text = "".join(f"Example: {src} -> {dst}\n" for src, dst in examples)
                prompt_text = (
                    f"{base_instructions}{gemini_rules}\n\n{examples_text}\nNow translate: {text}"
                )
                response = model.generate_content(prompt_text)
                translated = (response.text or "").strip()
                # Keep the first non-empty line that is not an echo of the prompt scaffolding
                for line in translated.split('\n'):
                    line = line.strip()
                    if line and not line.startswith(('Example:', 'Now translate:')):
                        translated = line
                        break
                if translated:
                    self.last_translation_method = 'gemini'
            except Exception as e:
                self.logger.warning(f"Gemini translation error: {e}")
                translated = None

        # Fallback to OpenAI if available
        if not translated and self.openai_api_key:
            messages = [{"role": "system", "content": base_instructions + chat_rules}]
            for src, dst in examples:
                messages.append({"role": "user", "content": f"Translate: {src}"})
                messages.append({"role": "assistant", "content": dst})
            messages.append({"role": "user", "content": f"Translate: {text}"})
            try:
                from openai import OpenAI
                client = OpenAI(api_key=self.openai_api_key)
                response = client.chat.completions.create(
                    model=self.config.OPENAI_MODEL,
                    max_tokens=150,
                    temperature=0.3,
                    messages=messages
                )
            except Exception as sdk_err:
                # Retry through the legacy module-level interface; if that
                # fails too, report the first (client-style) error.
                import openai
                try:
                    openai.api_key = self.openai_api_key
                    response = openai.ChatCompletion.create(
                        model=self.config.OPENAI_MODEL,
                        max_tokens=150,
                        temperature=0.3,
                        messages=messages
                    )
                except Exception:
                    raise sdk_err
            translated = (response.choices[0].message.content or "").strip()
            if translated:
                self.last_translation_method = 'llm'

        # ``translated`` is still None when Gemini failed and no OpenAI key
        # is configured, so guard before touching string methods.
        if translated and translated.startswith('"') and translated.endswith('"'):
            translated = translated[1:-1]
        return translated if translated else None
    except Exception as e:
        self.logger.warning(f"LLM translation error: {e}")
        return None
def translate_with_libre_translate(self, text: str, target_lang: str, source_lang: str = 'auto') -> Optional[str]:
    """Translate *text* through the LibreTranslate HTTP endpoint.

    Args:
        text: Text to translate.
        target_lang: Target language code; unknown codes fall back to 'en'.
        source_lang: Source language code; unknown codes fall back to 'auto'.

    Returns:
        The ``translatedText`` field of a successful (HTTP 200) response,
        or ``None`` on any other status, an empty result, or an error.
        Errors are logged as warnings, never raised.

    Side effects:
        Sets ``self.last_translation_method`` to 'libretranslate' when a
        non-empty translation is returned.
    """
    try:
        # Map language codes. Tamil is listed explicitly: without it a Tamil
        # target silently fell back to English and the untranslated text
        # came back labelled as a translation.
        lang_map = {
            'si': 'si',  # Sinhala
            'ta': 'ta',  # Tamil
            'en': 'en',  # English
            'auto': 'auto',
        }
        payload = {
            'q': text,
            'source': lang_map.get(source_lang, 'auto'),
            'target': lang_map.get(target_lang, 'en'),
            'format': 'text',
        }
        response = requests.post(
            self.libre_translate_url,
            json=payload,
            headers={'Content-Type': 'application/json'},
            timeout=10,
        )
        if response.status_code != 200:
            return None
        translated = response.json().get('translatedText')
        if not translated:
            return None
        self.logger.debug(f"LibreTranslate success: '{text}' -> '{translated}'")
        self.last_translation_method = 'libretranslate'
        return translated
    except Exception as e:
        self.logger.warning(f"LibreTranslate error: {e}")
        return None
def translate_with_mymemory(self, text: str, target_lang: str, source_lang: str = 'auto') -> Optional[str]:
    """Translate *text* through the MyMemory HTTP endpoint.

    Args:
        text: Text to translate.
        target_lang: Target language code, used as-is in ``langpair``.
        source_lang: Source language code; unknown codes fall back to 'auto'.

    Returns:
        ``responseData.translatedText`` from a successful reply, or ``None``
        when the HTTP status is not 200, the JSON body reports a
        ``responseStatus`` other than 200, the result is empty, or an
        error occurs. Errors are logged as warnings, never raised.

    Side effects:
        Sets ``self.last_translation_method`` to 'mymemory' when a
        non-empty translation is returned.
    """
    try:
        # Map language codes (Tamil previously fell through to 'auto')
        lang_map = {
            'si': 'si',  # Sinhala
            'ta': 'ta',  # Tamil
            'en': 'en',  # English
            'auto': 'auto',
        }
        source = lang_map.get(source_lang, 'auto')
        params = {
            'q': text,
            'langpair': f"{source}|{target_lang}",
        }
        response = requests.get(self.mymemory_url, params=params, timeout=10)
        if response.status_code != 200:
            return None
        result = response.json()

        # NOTE(review): MyMemory appears to report failures such as an
        # invalid language pair inside the JSON body, with the error text
        # placed in translatedText - confirm against the API spec. Checking
        # responseStatus keeps such text from being returned as a translation.
        status = result.get('responseStatus', 200)
        if str(status) != '200':
            self.logger.warning(f"MyMemory translation error: responseStatus={status}")
            return None

        translated = (result.get('responseData') or {}).get('translatedText')
        if not translated:
            return None
        self.logger.debug(f"MyMemory success: '{text}' -> '{translated}'")
        self.last_translation_method = 'mymemory'
        return translated
    except Exception as e:
        self.logger.warning(f"MyMemory translation error: {e}")
        return None
def translate_with_dictionary(self, text: str, target_lang: str, source_lang: str = 'auto') -> str:
    """Translate known transport terms by dictionary substitution.

    Args:
        text: Text to translate.
        target_lang: 'en', 'si' or 'ta'; any other value returns *text*
            unchanged.
        source_lang: Only consulted when the target is English. 'auto' asks
            ``self.language_detector`` (Tamil -> 'ta', anything else ->
            'si'); values other than 'si'/'ta' return *text* unchanged.

    Returns:
        *text* with every known term replaced; unknown words are kept.
    """
    if target_lang == 'en':
        if source_lang == 'auto':
            detected_lang = self.language_detector.detect_language(text)['language']
            source_lang = 'ta' if detected_lang == 'tamil' else 'si'  # Default to Sinhala

        if source_lang == 'si':
            terms = self.transport_terms
        elif source_lang == 'ta':
            terms = self.tamil_transport_terms
        else:
            return text

        # Longest term first: otherwise a short key rewrites the inside of a
        # longer one before the longer key is tried ('ගාස්තු' inside
        # 'ගාස්තුව' used to leave a stray 'ව' behind).
        translated = text
        for native, english in sorted(terms.items(), key=lambda kv: len(kv[0]), reverse=True):
            translated = translated.replace(native, english)
        return translated

    if target_lang == 'si':
        terms = self.transport_terms
    elif target_lang == 'ta':
        terms = self.tamil_transport_terms
    else:
        return text

    # English -> native. Several native spellings share one English word;
    # the first one listed wins, as it did with sequential replacement.
    reverse = {}
    for native, english in terms.items():
        reverse.setdefault(english, native)
    if not reverse:
        return text

    # One pass, whole words only, longest phrase first, so 'to' no longer
    # fires inside 'tomorrow' and 'how much' is preferred over 'how'.
    alternatives = '|'.join(re.escape(word) for word in sorted(reverse, key=len, reverse=True))
    pattern = re.compile(r'\b(?:' + alternatives + r')\b')
    return pattern.sub(lambda m: reverse[m.group(0)], text)
def translate_text(self, text: str, target_lang: str, source_lang: str = 'auto') -> str:
    """Translate *text*, trying each backend in turn until one succeeds.

    Order: LLM, MyMemory, LibreTranslate, Dictionary. When
    ``self.force_llm_translation`` is set only the LLM is attempted. If no
    attempt yields non-blank text, the dictionary translation is returned.

    Args:
        text: Text to translate; empty or whitespace-only input is returned
            unchanged.
        target_lang: Target language code.
        source_lang: Source language code, or 'auto'.

    Returns:
        The stripped result of the first successful backend, or the
        dictionary fallback result.

    Side effects:
        Leaves ``self.last_translation_method`` naming the backend that
        produced the returned text.
    """
    if not text or not text.strip():
        return text

    attempts = [('LLM', self.translate_with_llm)]
    if not self.force_llm_translation:
        attempts += [
            ('MyMemory', self.translate_with_mymemory),
            ('LibreTranslate', self.translate_with_libre_translate),
            # The caller's source language is forwarded so this attempt and
            # the final fallback below interpret the text the same way.
            ('Dictionary', self.translate_with_dictionary),
        ]

    for method_name, translate in attempts:
        # Clear the marker before every attempt; otherwise a value left by
        # an earlier call or a failed backend would be reported for this one.
        self.last_translation_method = None
        try:
            result = translate(text, target_lang, source_lang)
        except Exception as e:
            self.logger.warning(f"{method_name} translation failed: {e}")
            continue
        if result and result.strip():
            self.logger.info(f"Translation successful using {method_name}")
            if not self.last_translation_method:
                self.last_translation_method = method_name.lower()
            return result.strip()

    # Final fallback
    result = self.translate_with_dictionary(text, target_lang, source_lang)
    self.last_translation_method = 'dictionary'
    return result
def translate_query(self, query: str) -> Dict[str, Any]:
    """Detect the language of *query* and produce its English form.

    English input passes through untouched. For other languages the
    optional pattern parser is tried first (when enabled); otherwise the
    query goes through ``translate_text`` and ``_normalize_english_query``.

    Returns:
        A dict with the language flags, the detected language, the original
        and translated query, the translation method and the detector's
        confidence.
    """
    detection = self.language_detector.detect_language(query)
    language = detection['language']

    def envelope(translated_query, method):
        # Single place that assembles the result so every path shares one shape.
        return {
            'is_sinhala': language == 'sinhala',
            'is_tamil': language == 'tamil',
            'is_singlish': language == 'singlish',
            'detected_language': language,
            'original_query': query,
            'translated_query': translated_query,
            'translation_method': method,
            'detection_confidence': detection['confidence']
        }

    # Nothing to do for English input
    if language == 'english':
        return envelope(query, 'none')

    # Optional rule-based shortcut for Sinhala and Tamil fare questions
    if self.use_pattern_translation:
        parser_name = {
            'sinhala': '_parse_sinhala_fare_query',
            'tamil': '_parse_tamil_fare_query',
        }.get(language)
        parsed = getattr(self, parser_name)(query) if parser_name else None
        if parsed:
            self.logger.info(f"Pattern-based {language} fare parse: '{query}' -> '{parsed}'")
            return envelope(parsed, 'pattern')

    # Everything that is not Tamil is translated as Sinhala
    source_lang = 'ta' if language == 'tamil' else 'si'
    english = self._normalize_english_query(self.translate_text(query, 'en', source_lang))

    method = self.last_translation_method
    if not method:
        method = 'llm' if self.openai_api_key else 'dictionary'
    self.logger.info(f"Translated {language} query ({method}): '{query}' -> '{english}'")
    return envelope(english, method)
def _normalize_english_query(self, text: str) -> str:
    """Lower-case *text* and map fare synonyms onto the expected vocabulary.

    Replacements run in the listed order and match whole words only, so
    unrelated words that merely contain a synonym ("coffee", "recharge")
    are left alone.

    Args:
        text: English text; falsy input is returned unchanged.

    Returns:
        The lower-cased, normalized text.
    """
    if not text:
        return text
    replacements = (
        ('fees', 'fare'),
        ('fee', 'fare'),
        ('charges', 'cost'),
        ('charge', 'cost'),
        ('ticket price', 'fare'),
        ('ticket fare', 'fare'),
        ('bus ticket', 'bus fare'),
    )
    # The result stays lower-case; original casing is not restored.
    lowered = text.lower()
    for old, new in replacements:
        lowered = re.sub(rf'\b{re.escape(old)}\b', new, lowered)
    return lowered
def translate_response(self, response: Dict[str, Any], target_language: str = None) -> Dict[str, Any]:
    """Translate the user-facing fields of *response* out of English.

    Args:
        response: Response dict. The keys 'message', 'suggestions' and
            'corrections' (a list of dicts with 'original'/'corrected')
            are translated when present.
        target_language: 'si' or 'ta'. When omitted it is derived from
            ``response['translation_info']['detected_language']`` (Tamil ->
            'ta', anything else or missing -> 'si').

    Returns:
        A shallow copy of *response* with translated fields and a fresh
        'translation_info' entry describing the target language and the
        translation method.
    """
    translated_response = response.copy()

    # Determine target language from translation_info if not provided
    if target_language is None:
        info = response.get('translation_info') or {}
        target_language = 'ta' if info.get('detected_language') == 'tamil' else 'si'  # Default to Sinhala

    methods_used = []

    def to_target(value):
        # Translate one English string and note which backend handled it.
        result = self.translate_text(value, target_language, 'en')
        methods_used.append(getattr(self, 'last_translation_method', None))
        return result

    # Translate the main message
    if 'message' in response:
        translated_response['message'] = to_target(response['message'])

    # Translate suggestions if any
    if response.get('suggestions'):
        translated_response['suggestions'] = [to_target(s) for s in response['suggestions']]

    # Translate corrections if any
    if response.get('corrections'):
        translated_corrections = []
        for correction in response['corrections']:
            translated_correction = correction.copy()
            for field in ('original', 'corrected'):
                if field in correction:
                    translated_correction[field] = to_target(correction[field])
            translated_corrections.append(translated_correction)
        translated_response['corrections'] = translated_corrections

    # Report the backend used for the last translated field. Fall back to
    # the key-based guess only when nothing was translated or no backend
    # recorded itself.
    method = methods_used[-1] if methods_used else None
    if not method:
        method = 'llm' if self.openai_api_key else 'dictionary'

    # Add translation metadata
    translated_response['translation_info'] = {
        'translated': True,
        'target_language': target_language,
        'translation_method': method
    }
    return translated_response
def get_sinhala_examples(self) -> Dict[str, Any]:
    """Return sample Sinhala queries grouped by query category.

    Each category maps to a list of dicts holding a 'query' and its
    'description'.
    """
    def entry(query, description):
        return {'query': query, 'description': description}

    return {
        'fare_queries': [
            entry('කොළඹ සිට මහනුවරට යන බස් ගාස්තුව කීයද?',
                  'කොළඹ සිට මහනුවරට යන බස් ගාස්තුව සොයන්න'),
            entry('මාතර සිට ගාල්ලට යන මිල කීයද?',
                  'මාතර සිට ගාල්ලට යන මිල සොයන්න'),
            entry('අනුරාධපුර සිට කොළඹට යන වාරිකය',
                  'අනුරාධපුර සිට කොළඹට යන වාරිකය සොයන්න'),
        ],
        'comparison_queries': [
            entry('කොළඹ සිට මහනුවරට සහ කොළඹ සිට ගාල්ලට යන ගාස්තු සසඳන්න',
                  'විවිධ මාර්ගවල ගාස්තු සසඳන්න'),
            entry('කොළඹ සිට මහනුවරට සහ කොළඹ සිට අනුරාධපුරට යන ගාස්තුවල වෙනස කීයද?',
                  'මාර්ග දෙකක ගාස්තු වෙනස සොයන්න'),
        ],
        'range_queries': [
            entry('රුපියල් 500 ට අඩු ගාස්තු සහිත මාර්ග සොයන්න',
                  'රුපියල් 500 ට අඩු ගාස්තු සහිත මාර්ග සොයන්න'),
            entry('රුපියල් 200 සහ 800 අතර ගාස්තු සහිත මාර්ග පෙන්වන්න',
                  'රුපියල් 200 සහ 800 අතර ගාස්තු සහිත මාර්ග සොයන්න'),
        ],
        'recommendation_queries': [
            entry('අඩු මිලේ මාර්ග නිර්දේශ කරන්න',
                  'අඩු මිලේ මාර්ග නිර්දේශ කරන්න'),
            entry('ප්රසිද්ධ ගමනාන්ත පෙන්වන්න',
                  'ප්රසිද්ධ ගමනාන්ත සොයන්න'),
        ],
        'statistical_queries': [
            entry('සාමාන්ය ගාස්තුව කීයද?',
                  'සාමාන්ය ගාස්තුව සොයන්න'),
            entry('දත්ත ගබඩා සංඛ්යාලේඛන',
                  'දත්ත ගබඩා සංඛ්යාලේඛන සොයන්න'),
        ],
    }
def get_tamil_examples(self) -> Dict[str, Any]:
    """Return sample Tamil queries grouped by query category.

    Each category maps to a list of dicts holding a 'query' and its
    'description'.
    """
    def entry(query, description):
        return {'query': query, 'description': description}

    return {
        'fare_queries': [
            entry('கொழும்பு இருந்து கண்டி வரை பேருந்து கட்டணம் எவ்வளவு?',
                  'கொழும்பு இருந்து கண்டி வரை பேருந்து கட்டணம் கண்டுபிடி'),
            entry('மாத்தறை இருந்து காலி வரை விலை எவ்வளவு?',
                  'மாத்தறை இருந்து காலி வரை விலை கண்டுபிடி'),
            entry('அனுராதபுரம் இருந்து கொழும்பு வரை கட்டணம்',
                  'அனுராதபுரம் இருந்து கொழும்பு வரை கட்டணம் கண்டுபிடி'),
        ],
        'comparison_queries': [
            entry('கொழும்பு இருந்து கண்டி வரை மற்றும் கொழும்பு இருந்து காலி வரை கட்டணம் ஒப்பிடு',
                  'வெவ்வேறு பாதைகளின் கட்டணம் ஒப்பிடு'),
            entry('கொழும்பு இருந்து கண்டி வரை மற்றும் கொழும்பு இருந்து அனுராதபுரம் வரை கட்டணத்தின் வித்தியாசம் எவ்வளவு?',
                  'இரண்டு பாதைகளின் கட்டண வித்தியாசம் கண்டுபிடி'),
        ],
        'range_queries': [
            entry('ரூபாய் 500 க்கு குறைவான கட்டணம் உள்ள பாதைகளை கண்டுபிடி',
                  'ரூபாய் 500 க்கு குறைவான கட்டணம் உள்ள பாதைகளை கண்டுபிடி'),
            entry('ரூபாய் 200 மற்றும் 800 இடையில் கட்டணம் உள்ள பாதைகளை காட்டு',
                  'ரூபாய் 200 மற்றும் 800 இடையில் கட்டணம் உள்ள பாதைகளை கண்டுபிடி'),
        ],
        'recommendation_queries': [
            entry('குறைந்த விலையில் பாதைகளை பரிந்துரை',
                  'குறைந்த விலையில் பாதைகளை பரிந்துரை'),
            entry('பிரபலமான இலக்குகளை காட்டு',
                  'பிரபலமான இலக்குகளை கண்டுபிடி'),
        ],
        'statistical_queries': [
            entry('சராசரி கட்டணம் எவ்வளவு?',
                  'சராசரி கட்டணம் கண்டுபிடி'),
            entry('தரவு சேமிப்பக புள்ளிவிவரங்கள்',
                  'தரவு சேமிப்பக புள்ளிவிவரங்கள் கண்டுபிடி'),
        ],
    }
| def test_translation(self) -> Dict[str, Any]: | |
| """Test translation functionality on transportation-related queries in multiple languages.""" | |
| test_cases = [ | |
| # Sinhala test cases | |
| { | |
| 'language': 'sinhala', | |
| 'original': 'කොළඹ සිට මහනුවරට යන බස් ගාස්තුව කීයද?', | |
| 'expected_english': 'What is the bus fare from Colombo to Kandy?' | |
| }, | |
| { | |
| 'language': 'sinhala', | |
| 'original': 'මාතර සිට ගාල්ලට යන මිල කීයද?', | |
| 'expected_english': 'How much is the price from Matara to Galle?' | |
| }, | |
| { | |
| 'language': 'sinhala', | |
| 'original': 'කොළඹ සිට පානදුර දක්වා සහ කොළඹ සිට ගාල්ල දක්වා ගාස්තු සසඳා බලන්න.', | |
| 'expected_english': 'Compare fares from Colombo to Panadura and from Colombo to Galle.' | |
| }, | |
| { | |
| 'language': 'sinhala', | |
| 'original': 'රුපියල් 500 ට අඩු ගාස්තු සහිත මාර්ග පෙන්වන්න.', | |
| 'expected_english': 'Show routes with fares under 500 rupees.' | |
| }, | |
| { | |
| 'language': 'sinhala', | |
| 'original': 'අඩු මිලේ මාර්ග නිර්දේශ කරන්න.', | |
| 'expected_english': 'Recommend cheap routes.' | |
| }, | |
| # Tamil test cases | |
| { | |
| 'language': 'tamil', | |
| 'original': 'கொழும்பு இருந்து கண்டி வரை பேருந்து கட்டணம் எவ்வளவு?', | |
| 'expected_english': 'What is the bus fare from Colombo to Kandy?' | |
| }, | |
| { | |
| 'language': 'tamil', | |
| 'original': 'மாத்தறை இருந்து காலி வரை விலை எவ்வளவு?', | |
| 'expected_english': 'How much is the price from Matara to Galle?' | |
| }, | |
| { | |
| 'language': 'tamil', | |
| 'original': 'கொழும்பு இருந்து பனதுரை வரை மற்றும் கொழும்பு இருந்து காலி வரை கட்டணம் ஒப்பிடு.', | |
| 'expected_english': 'Compare fares from Colombo to Panadura and from Colombo to Galle.' | |
| }, | |
| { | |
| 'language': 'tamil', | |
| 'original': 'ரூபாய் 500 க்கு குறைவான கட்டணம் உள்ள பாதைகளை காட்டு.', | |
| 'expected_english': 'Show routes with fares under 500 rupees.' | |
| }, | |
| { | |
| 'language': 'tamil', | |
| 'original': 'குறைந்த விலையில் பாதைகளை பரிந்துரை.', | |
| 'expected_english': 'Recommend cheap routes.' | |
| }, | |
| # Singlish test cases | |
| { | |
| 'language': 'singlish', | |
| 'original': 'කොළඹ සිට Kandy ගාස්තුව කීයද?', | |
| 'expected_english': 'What is the fare from Colombo to Kandy?' | |
| }, | |
| { | |
| 'language': 'singlish', | |
| 'original': 'Colombo සිට ගාල්ලට bus fare කීයද?', | |
| 'expected_english': 'What is the bus fare from Colombo to Galle?' | |
| }, | |
| { | |
| 'language': 'singlish', | |
| 'original': 'කොළඹ සිට Panadura සහ Colombo සිට Galle fares compare කරන්න.', | |
| 'expected_english': 'Compare fares from Colombo to Panadura and from Colombo to Galle.' | |
| }, | |
| # English test cases | |
| { | |
| 'language': 'english', | |
| 'original': 'What is the fare from Colombo to Kandy?', | |
| 'expected_english': 'What is the fare from Colombo to Kandy?' | |
| }, | |
| { | |
| 'language': 'english', | |
| 'original': 'Show me routes from Panadura', | |
| 'expected_english': 'Show me routes from Panadura' | |
| } | |
| ] | |
| results = [] | |
| total_exact = 0 | |
| total_good = 0 | |
| total_tests = len(test_cases) | |
| for test_case in test_cases: | |
| original = test_case['original'] | |
| expected = test_case['expected_english'] | |
| language = test_case['language'] | |
| # Detect language | |
| detection_result = self.language_detector.detect_language(original) | |
| detected_language = detection_result['language'] | |
| # Reset method tracker and translate | |
| self.last_translation_method = None | |
| translated = self.translate_text(original, 'en', 'auto') or '' | |
| tr = translated.strip() | |
| ex = expected.strip() | |
| tr_low = tr.lower() | |
| ex_low = ex.lower() | |
| # Accuracy heuristic | |
| if tr_low == ex_low: | |
| accuracy = 'exact' | |
| total_exact += 1 | |
| total_good += 1 | |
| elif tr_low in ex_low or ex_low in tr_low: | |
| accuracy = 'good' | |
| total_good += 1 | |
| else: | |
| accuracy = 'partial' | |
| # Intent preservation check for comparisons | |
| intent_preserved = True | |
| if language in ['sinhala', 'tamil'] and ('සසඳ' in original or 'ஒப்பிடு' in original): | |
| intent_preserved = ('compare' in tr_low) | |
| results.append({ | |
| 'original_query': original, | |
| 'language': language, | |
| 'detected_language': detected_language, | |
| 'translated_english': tr, | |
| 'expected_english': ex, | |
| 'translation_accuracy': accuracy, | |
| 'intent_preserved': intent_preserved, | |
| 'method_used': self.last_translation_method or ('llm' if self.openai_api_key else 'dictionary'), | |
| 'detection_confidence': detection_result['confidence'] | |
| }) | |
| summary = { | |
| 'total_tests': total_tests, | |
| 'exact_matches': total_exact, | |
| 'good_or_better': total_good, | |
| 'accuracy_rate_percent': round((total_good / total_tests) * 100, 2) if total_tests else 0 | |
| } | |
| self.logger.info(f"Translation test summary: {summary}") | |
| return { | |
| 'translation_service_status': 'active', | |
| 'supported_languages': ['sinhala', 'tamil', 'singlish', 'english'], | |
| 'available_methods': { | |
| 'llm': self.openai_api_key is not None, | |
| 'libre_translate': True, | |
| 'mymemory': True, | |
| 'dictionary': True | |
| }, | |
| 'summary': summary, | |
| 'test_results': results | |
| } | |