#!/usr/bin/env python3
"""
Test script for TTS text cleaning functionality.

Feeds representative inputs (plain text, emojis, markdown, typographic
punctuation, repeated punctuation) through ``clean_text_for_speech`` and
reports whether emoji or markdown markers survive the cleaning.

The real ``TTSService`` is used when it can be imported and constructed;
otherwise a self-contained fallback cleaner defined in this file is used.

Every non-ASCII character in this file is written as a ``\\u``/``\\U``
escape so the source stays pure ASCII and cannot be corrupted by an
encoding round-trip.
"""

import os
import re
import sys

# Add the parent directory to sys.path to import from src
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

try:
    from src.services.tts_service import TTSService
except ImportError:
    # The project package (or one of its dependencies) is unavailable; the
    # fallback cleaner below is exercised instead.
    TTSService = None


# (pattern, replacement) pairs, applied in order.  Bold must be handled
# before italic, otherwise ``**x**`` would be consumed as two italics.
_MARKDOWN_RULES = (
    (re.compile(r'\*\*(.*?)\*\*'), r'\1'),      # bold **text**
    (re.compile(r'\*(.*?)\*'), r'\1'),          # italic *text*
    (re.compile(r'`(.*?)`'), r'\1'),            # code `text`
    (re.compile(r'#{1,6}\s'), ''),              # headers # ## ###
    (re.compile(r'\[(.*?)\]\(.*?\)'), r'\1'),   # links [text](url) -> text
)

# Code-point ranges treated as emoji.  Used both by the fallback cleaner
# (to strip them) and by the test (to verify none are left).
_EMOJI_PATTERN = re.compile(
    '['
    '\U0001F600-\U0001F64F'  # emoticons
    '\U0001F300-\U0001F5FF'  # miscellaneous symbols and pictographs
    '\U0001F680-\U0001F6FF'  # transport and map symbols
    '\U0001F1E0-\U0001F1FF'  # regional indicators (flags)
    '\u2600-\u26FF'          # miscellaneous symbols
    '\u2700-\u27BF'          # dingbats
    '\uFE00-\uFE0F'          # variation selectors
    '\U0001F900-\U0001F9FF'  # supplemental symbols and pictographs
    ']'
)

# Single-pass character mapping: typographic punctuation is replaced by its
# ASCII equivalent and zero-width characters are dropped.
_CHAR_TABLE = str.maketrans({
    '\u201C': '"', '\u201D': '"',      # curly double quotes
    '\u2018': "'", '\u2019': "'",      # curly single quotes / apostrophe
    '\u2013': '-', '\u2014': '-',      # en dash, em dash
    '\u2026': '...',                   # horizontal ellipsis
    '\u00AB': '"', '\u00BB': '"',      # double guillemets
    '\u2039': "'", '\u203A': "'",      # single guillemets
    '\u200B': None, '\u200C': None,    # zero-width space / non-joiner
    '\u200D': None, '\uFEFF': None,    # zero-width joiner / BOM
})

# Control characters, EXCLUDING whitespace ones (\n, \t, ...).  Those are
# left for the whitespace collapse so that "a\nb" becomes "a b" rather than
# "ab".
_CONTROL_CHARS = re.compile(r'(?!\s)[\u0000-\u001F\u007F-\u009F]')

_WHITESPACE_RUN = re.compile(r'\s+')
_REPEATED_DOTS = re.compile(r'\.{3,}')
_REPEATED_BANGS = re.compile(r'!{2,}')
_REPEATED_QUESTIONS = re.compile(r'\?{2,}')
_SENTENCE_BOUNDARY = re.compile(r'([.!?])\s*([A-Z])')

# Characters whose presence in cleaned text means markdown leaked through.
_MARKDOWN_MARKERS = ('*', '`', '#')


class _FallbackTTSService:
    """Stand-in exposing only ``clean_text_for_speech``.

    Used when the real ``TTSService`` cannot be imported or constructed.
    """

    def clean_text_for_speech(self, text):
        """Return ``text`` with markdown, emojis and odd characters removed.

        Args:
            text: The raw text to clean.

        Returns:
            The cleaned string, or ``""`` when ``text`` is falsy or not a
            ``str``.
        """
        if not text or not isinstance(text, str):
            return ""

        # Remove markdown formatting
        for pattern, replacement in _MARKDOWN_RULES:
            text = pattern.sub(replacement, text)

        # Remove emojis and special unicode characters
        text = _EMOJI_PATTERN.sub('', text)

        # Normalise typographic punctuation and drop zero-width characters
        text = text.translate(_CHAR_TABLE)

        # Remove non-whitespace control characters, then collapse whitespace
        text = _CONTROL_CHARS.sub('', text)
        text = _WHITESPACE_RUN.sub(' ', text).strip()

        # Remove multiple consecutive punctuation
        text = _REPEATED_DOTS.sub('...', text)
        text = _REPEATED_BANGS.sub('!', text)
        text = _REPEATED_QUESTIONS.sub('?', text)

        # Ensure exactly one space between a sentence end and the next
        # capitalised word
        text = _SENTENCE_BOUNDARY.sub(r'\1 \2', text)

        return text


def _create_tts_service():
    """Return a real ``TTSService`` if usable, else the fallback cleaner."""
    # Dummy key for testing: only the cleaning function is exercised.
    os.environ["YOUR_DEEPGRAM_API_KEY"] = "test_key"

    if TTSService is None:
        print("TTSService could not be imported; using fallback cleaner\n")
        return _FallbackTTSService()

    try:
        return TTSService()
    except Exception as exc:
        # Construction problems must not prevent the cleaning logic from
        # being tested, but they are reported rather than hidden.
        print(f"TTSService() failed ({exc!r}); using fallback cleaner\n")
        return _FallbackTTSService()


_TEST_CASES = [
    {
        "name": "Simple text",
        "input": "Hello, how are you today?",
        "expected_clean": True,
    },
    {
        "name": "Text with emojis",
        # party popper, glowing star, flexed biceps
        "input": (
            "Great job! \U0001F389 You're doing amazing! \U0001F31F "
            "Keep it up! \U0001F4AA"
        ),
        "expected_clean": True,
    },
    {
        "name": "Markdown formatting",
        "input": "This is **bold** and this is *italic* and `code`",
        "expected_clean": True,
    },
    {
        "name": "Complex markdown with links",
        "input": (
            "Check out [this link](https://example.com) and ## Header text"
        ),
        "expected_clean": True,
    },
    {
        "name": "Mixed content",
        # direct hit, books, sparkles
        "input": (
            "\U0001F3AF **Practice Goal**: Learn English conversation "
            "skills! Visit [our website](https://wise.com) for more tips. "
            "\U0001F4DA\u2728"
        ),
        "expected_clean": True,
    },
    {
        "name": "Smart quotes and dashes",
        # curly double/single quotes, em dash, en dash, ellipsis
        "input": (
            "\u201CHello world\u201D and \u2018smart quotes\u2019 with "
            "em\u2014dash and en\u2013dash\u2026"
        ),
        "expected_clean": True,
    },
    {
        "name": "Multiple punctuation",
        "input": "Wow!!! This is amazing??? Really......",
        "expected_clean": True,
    },
    {
        "name": "Real AI response",
        # glowing star, flexed biceps, direct hit
        "input": (
            "\U0001F31F **Excellent!** You did a great job with that "
            "conversation! Here are some tips:\n\n- Use *natural* "
            "expressions\n- Practice `daily`\n- Visit [practice site]"
            "(https://example.com)\n\n\U0001F4AA Keep practicing! "
            "\U0001F3AF"
        ),
        "expected_clean": True,
    },
]


def test_text_cleaning():
    """Test the text cleaning functionality.

    Prints the input, the cleaned output and a verdict for every test case.

    Raises:
        AssertionError: If any cleaned output still contains emoji or
            markdown marker characters.
    """
    print("\U0001F9EA Testing TTS Text Cleaning Function\n")

    tts = _create_tts_service()

    print("Testing text cleaning function:\n")

    failures = []
    for i, test_case in enumerate(_TEST_CASES, 1):
        print(f"Test {i}: {test_case['name']}")
        print(f"Input: '{test_case['input']}'")

        cleaned = tts.clean_text_for_speech(test_case['input'])
        print(f"Output: '{cleaned}'")

        # Check if cleaning was successful
        has_emojis = _EMOJI_PATTERN.search(cleaned) is not None
        has_markdown = any(marker in cleaned for marker in _MARKDOWN_MARKERS)

        if has_emojis:
            print("\u274C Still contains emojis")
            failures.append(f"{test_case['name']}: emojis remain")
        elif has_markdown:
            print("\u274C Still contains markdown")
            failures.append(f"{test_case['name']}: markdown remains")
        elif not cleaned.strip():
            # Reported as a warning only; not counted as a failure.
            print("\u26A0\uFE0F Text became empty after cleaning")
        else:
            print("\u2705 Cleaned successfully")

        print("-" * 50)
        print()

    assert not failures, "Text cleaning failed for: " + "; ".join(failures)


if __name__ == "__main__":
    test_text_cleaning()