Spaces:
Paused
Paused
| """ | |
| TTS Text Preprocessing Utilities with Multilingual Support | |
| """ | |
| import re | |
| import json | |
| from typing import Dict, Set, Optional | |
| from num2words import num2words | |
| from pathlib import Path | |
| from config.locale_manager import LocaleManager | |
class TTSPreprocessor:
    """Text preprocessor for TTS providers with multilingual support.

    Rewrites currency amounts, times, ISO dates, alphanumeric codes,
    percentages and numbers into a form that is easier to speak. Which
    rewrites run is selected by the ``PREPROCESS_*`` flags passed to
    :meth:`preprocess`; locale-specific words and formats are read from
    ``self.locale_data``.
    """

    # Preprocessing flags
    PREPROCESS_NUMBERS = "numbers"
    PREPROCESS_CURRENCY = "currency"
    PREPROCESS_TIME = "time"
    PREPROCESS_DATE = "date"
    PREPROCESS_CODES = "codes"
    PREPROCESS_PERCENTAGE = "percentage"

    # A written currency amount: digits followed by any number of "." / ","
    # separated digit groups, so "1,234.56" is captured as a single amount.
    _AMOUNT = r'\d+(?:[.,]\d+)*'

    def __init__(self, language: str = "tr"):
        """Store the language code and load its locale data via LocaleManager."""
        self.language = language
        self.locale_data = LocaleManager.get_locale(language)

    def preprocess(self, text: str, flags: Set[str]) -> str:
        """Apply the preprocessing steps selected by ``flags``.

        Args:
            text: Input text.
            flags: Collection of ``PREPROCESS_*`` values. Empty or ``None``
                leaves the text unchanged.

        Returns:
            The text after every selected step has been applied.
        """
        if not flags:
            return text

        # Fixed order; numbers go last so that the digits inside currency
        # amounts, times, dates, codes and percentages are still intact when
        # those steps look for them.
        steps = (
            (self.PREPROCESS_CURRENCY, self._process_currency),
            (self.PREPROCESS_TIME, self._process_time),
            (self.PREPROCESS_DATE, self._process_date),
            (self.PREPROCESS_CODES, self._process_codes),
            (self.PREPROCESS_PERCENTAGE, self._process_percentage),
            (self.PREPROCESS_NUMBERS, self._process_numbers),
        )
        for flag, step in steps:
            if flag in flags:
                text = step(text)
        return text

    def _to_words(self, number: int) -> str:
        """Spell ``number`` in ``self.language``, falling back to English
        when num2words raises NotImplementedError for that language."""
        try:
            return num2words(number, lang=self.language)
        except NotImplementedError:
            return num2words(number, lang='en')

    def _process_numbers(self, text: str) -> str:
        """Convert numbers to words using the locale's number format.

        Integers from 0 up to ``small_number_threshold`` (default 100) are
        left as digits. Larger integers are spelled out. Numbers with a
        non-zero fractional part become
        ``<integer words> <decimal_word> <fraction words>``, where the
        fraction is read as hundredths, truncated to two digits
        (``2.3`` -> 30, ``1.234`` -> 23).

        Raises:
            KeyError: if the locale data has no ``numbers`` section or lacks
                one of its separator / decimal-word entries.
        """
        number_format = self.locale_data["numbers"]
        decimal_sep = number_format["decimal_separator"]
        thousands_sep = number_format["thousands_separator"]
        decimal_word = number_format["decimal_word"]
        threshold = self.locale_data.get("small_number_threshold", 100)

        def replace_number(match):
            original = match.group()

            # Normalize to plain "<digits>[.<digits>]", e.g. with "." as the
            # thousands and "," as the decimal separator: 1.234,56 -> 1234.56
            normalized = original
            if thousands_sep:
                normalized = normalized.replace(thousands_sep, '')
            if decimal_sep:
                normalized = normalized.replace(decimal_sep, '.')
            int_digits, _, frac_digits = normalized.partition('.')

            try:
                # Parse the digit strings directly: going through float()
                # loses precision on very large integers and mis-truncates
                # fractions (e.g. 0.3 * 100 -> 29.999...).
                integer_part = int(int_digits)

                if not frac_digits.strip('0'):
                    # Whole number (also "5.00"): keep small values as digits.
                    if 0 <= integer_part <= threshold:
                        return str(integer_part)
                    return self._to_words(integer_part)

                hundredths = int((frac_digits + '0')[:2])
                int_words = self._to_words(integer_part)
                dec_words = self._to_words(hundredths)
                return f"{int_words} {decimal_word} {dec_words}"
            except Exception:
                # Best effort: if conversion fails for any reason, leave the
                # number exactly as it was written.
                return original

        # Build the pattern from the locale separators: either a number with
        # 3-digit groups, or a plain digit run, each with an optional fraction.
        fraction = rf'(?:{re.escape(decimal_sep)}\d+)?' if decimal_sep else ''
        plain = rf'\b\d+{fraction}\b'
        if thousands_sep:
            grouped = rf'\b\d{{1,3}}(?:{re.escape(thousands_sep)}\d{{3}})*{fraction}\b'
            pattern = f'{grouped}|{plain}'
        else:
            pattern = plain

        return re.sub(pattern, replace_number, text)

    def _process_codes(self, text: str) -> str:
        """Spell out codes such as PNRs or flight numbers (language agnostic).

        A code is 2-5 uppercase ASCII letters immediately followed by 2-5
        digits; its characters are separated by single spaces
        (``TK1234`` -> ``T K 1 2 3 4``).
        """
        def spell_code(match):
            return ' '.join(match.group())

        # Match uppercase letters followed by numbers
        pattern = r'\b[A-Z]{2,5}\d{2,5}\b'
        return re.sub(pattern, spell_code, text)

    def _process_currency(self, text: str) -> str:
        """Replace currency symbols and codes with the locale's currency word.

        Reads ``symbol``, ``word``, ``code`` and ``position`` from the
        ``currency`` section of the locale data. Amounts adjacent to the
        symbol (on the side given by ``position``) or followed by the code
        become ``<amount> <word>``; any remaining symbol becomes
        `` <word> ``. The text is returned unchanged when the section is not
        a dict or has no ``word``.
        """
        currency_data = self.locale_data.get("currency", {})
        if not isinstance(currency_data, dict):
            return text

        symbol = currency_data.get("symbol", "")
        word = currency_data.get("word", "")
        code = currency_data.get("code", "")
        position = currency_data.get("position", "before")

        if not word:
            return text

        # A function replacement keeps the locale word literal (no
        # backslash/group-reference interpretation by re.sub).
        def amount_then_word(match):
            return f"{match.group(1)} {word}"

        if symbol:
            # Symbol attached to an amount must be handled BEFORE the blanket
            # symbol replacement, otherwise there is no symbol left to match.
            if position == "before":
                # $100 -> 100 dollar
                pattern = rf'{re.escape(symbol)}\s*({self._AMOUNT})'
            else:
                # 100₺ -> 100 lira
                pattern = rf'({self._AMOUNT})\s*{re.escape(symbol)}'
            text = re.sub(pattern, amount_then_word, text)

            # Replace standalone symbols
            text = text.replace(symbol, f" {word} ")

        # Process currency codes (case-insensitive, as before)
        if code:
            pattern = rf'({self._AMOUNT})\s*{re.escape(code)}\b'
            text = re.sub(pattern, amount_then_word, text, flags=re.IGNORECASE)

        return text

    def _process_percentage(self, text: str) -> str:
        """Replace the percent sign with the locale's percentage word.

        ``position`` from the ``percentage`` section decides the shape that
        is recognised and produced: ``"before"`` turns ``%50`` into
        ``<word> 50``; anything else turns ``50%`` into ``50 <word>``.
        """
        percentage = self.locale_data.get("percentage", {})
        if not isinstance(percentage, dict):
            return text

        word = percentage.get("word", "percent")
        position = percentage.get("position", "after")

        if position == "before":
            # %50 -> yüzde 50
            return re.sub(
                r'%\s*(\d+(?:[.,]\d+)?)',
                lambda m: f"{word} {m.group(1)}",
                text,
            )

        # 50% -> 50 percent
        return re.sub(
            r'(\d+(?:[.,]\d+)?)\s*%',
            lambda m: f"{m.group(1)} {word}",
            text,
        )

    def _process_date(self, text: str) -> str:
        """Rewrite ISO dates (YYYY-MM-DD) with a month name.

        The month name is looked up in the locale's ``months`` mapping using
        the two-digit month string as written; if it is missing, the digits
        are kept. The output layout follows ``date_format``:
        ``DD.MM.YYYY`` -> ``<day> <month> <year>``,
        ``MM/DD/YYYY`` -> ``<month> <day>, <year>``; any other format leaves
        dates untouched.
        """
        months = self.locale_data.get("months", {})
        date_format = self.locale_data.get("date_format", "YYYY-MM-DD")

        if not isinstance(months, dict) or not isinstance(date_format, str):
            return text

        day_first = "DD.MM.YYYY" in date_format
        month_first = "MM/DD/YYYY" in date_format
        if not (day_first or month_first):
            return text

        def replace_date(match):
            year, month, day = match.groups()
            month_name = months.get(month, month)
            if day_first:
                return f"{int(day)} {month_name} {year}"
            return f"{month_name} {int(day)}, {year}"

        # Lookarounds keep the pattern from matching inside a longer digit run.
        pattern = r'(?<!\d)(\d{4})-(\d{2})-(\d{2})(?!\d)'
        return re.sub(pattern, replace_date, text)

    def _process_time(self, text: str) -> str:
        """Rewrite ``H:MM`` / ``HH:MM`` times.

        With the locale's time ``format`` set to ``"word"`` (the default) the
        hour and minute are spelled with num2words and joined by the locale's
        ``separator``; a zero minute yields the hour alone. With any other
        format, or when num2words does not support the language, the result
        is ``"<hour> <minute>"`` in digits.
        """
        time_data = self.locale_data.get("time", {})
        if isinstance(time_data, dict):
            time_format = time_data.get("format", "word")
            separator = time_data.get("separator", " ")
        else:
            time_format = "word"
            separator = " "

        def replace_time(match):
            hour, minute = match.groups()
            if time_format != "word":
                return f"{hour} {minute}"
            try:
                hour_word = num2words(int(hour), lang=self.language)
                if int(minute) == 0:
                    return hour_word
                minute_word = num2words(int(minute), lang=self.language)
            except NotImplementedError:
                return f"{hour} {minute}"
            return f"{hour_word}{separator}{minute_word}"

        # Do not match inside longer digit runs or colon-separated sequences
        # such as 12:30:45, which would otherwise be only half converted.
        pattern = r'(?<!\d)(?<!\d:)(\d{1,2}):(\d{2})(?!:?\d)'
        return re.sub(pattern, replace_time, text)