Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Test script for TTS text cleaning functionality | |
| """ | |
| import sys | |
| import os | |
| # Add the parent directory to sys.path to import from src | |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from src.services.tts_service import TTSService | |
# Code-point ranges treated as emoji. Shared by the fallback cleaner and the
# post-cleaning check so the two cannot disagree about what counts as an emoji.
_EMOJI_RANGES = (
    (0x1F600, 0x1F64F),  # emoticons
    (0x1F300, 0x1F5FF),  # misc symbols and pictographs
    (0x1F680, 0x1F6FF),  # transport and map symbols
    (0x1F1E0, 0x1F1FF),  # regional indicator symbols (flags)
    (0x2600, 0x26FF),    # misc symbols
    (0x2700, 0x27BF),    # dingbats
    (0xFE00, 0xFE0F),    # variation selectors
    (0x1F900, 0x1F9FF),  # supplemental symbols and pictographs
)

# Typographic characters mapped to plain ASCII. Written as escapes so the
# table survives editors and tools that silently "straighten" curly quotes.
_TYPOGRAPHY_TABLE = str.maketrans({
    "\u201c": '"',    # left double quotation mark
    "\u201d": '"',    # right double quotation mark
    "\u2018": "'",    # left single quotation mark
    "\u2019": "'",    # right single quotation mark / typographic apostrophe
    "\u2013": "-",    # en dash
    "\u2014": "-",    # em dash
    "\u2026": "...",  # horizontal ellipsis
    "\u00ab": '"',    # left guillemet
    "\u00bb": '"',    # right guillemet
    "\u2039": "'",    # single left guillemet
    "\u203a": "'",    # single right guillemet
})


def _is_emoji(char):
    """Return True if the single character *char* lies in one of _EMOJI_RANGES."""
    code_point = ord(char)
    return any(low <= code_point <= high for low, high in _EMOJI_RANGES)


class _FallbackTTSService:
    """Minimal stand-in used when the real TTSService cannot be constructed.

    Only provides ``clean_text_for_speech``, the one method this script calls.
    """

    def clean_text_for_speech(self, text):
        """Return *text* with markdown, emojis and odd typography removed.

        Non-string or empty input yields an empty string.
        """
        import re

        if not text or not isinstance(text, str):
            return ""

        # Strip markdown markers but keep the text they wrap.
        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)      # **bold**
        text = re.sub(r'\*(.*?)\*', r'\1', text)          # *italic*
        text = re.sub(r'`(.*?)`', r'\1', text)            # `code`
        text = re.sub(r'#{1,6}\s', '', text)              # "# " .. "###### " headers
        text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)   # [text](url) -> text

        # Drop emojis and variation selectors.
        text = ''.join(char for char in text if not _is_emoji(char))

        # Curly quotes, dashes, ellipsis and guillemets -> ASCII.
        text = text.translate(_TYPOGRAPHY_TABLE)

        # Zero-width characters and BOM.
        text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
        # Control characters, except the whitespace ones (\n, \t, \r, ...):
        # those must survive until the collapse below so that line breaks
        # become a space instead of gluing neighbouring words together.
        text = re.sub(r'(?!\s)[\u0000-\u001F\u007F-\u009F]', '', text)

        # Collapse every whitespace run to a single space.
        text = re.sub(r'\s+', ' ', text).strip()

        # Collapse repeated punctuation.
        text = re.sub(r'\.{3,}', '...', text)
        text = re.sub(r'!{2,}', '!', text)
        text = re.sub(r'\?{2,}', '?', text)

        # Exactly one space between sentence-ending punctuation and a
        # following capital letter.
        text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)
        return text


def test_text_cleaning():
    """Run sample inputs through ``clean_text_for_speech`` and print a report.

    Tries to build a ``TTSService``; if construction raises, a notice is
    printed and ``_FallbackTTSService`` is used instead. For each sample the
    input, the cleaned output and a verdict (emoji left over, markdown left
    over, empty result, or success) are printed. Nothing is asserted and the
    function returns ``None``.
    """
    print("🧪 Testing TTS Text Cleaning Function\n")

    # Dummy key so the service constructor can run; only the cleaning method
    # is exercised here.
    # NOTE(review): presumably this is the variable TTSService reads - confirm.
    os.environ["YOUR_DEEPGRAM_API_KEY"] = "test_key"

    try:
        tts = TTSService()
    except Exception as exc:  # not a bare except: let Ctrl-C / SystemExit through
        # Say so explicitly: otherwise the report silently describes the
        # fallback rather than the real service.
        print(f"⚠️ TTSService unavailable ({exc!r}); using fallback cleaner\n")
        tts = _FallbackTTSService()

    # (name, input) pairs.
    test_cases = [
        ("Simple text", "Hello, how are you today?"),
        (
            "Text with emojis",
            "Great job! 🎉 You're doing amazing! 🌟 Keep it up! 💪",
        ),
        (
            "Markdown formatting",
            "This is **bold** and this is *italic* and `code`",
        ),
        (
            "Complex markdown with links",
            "Check out [this link](https://example.com) and ## Header text",
        ),
        (
            "Mixed content",
            "🎯 **Practice Goal**: Learn English conversation skills! Visit [our website](https://wise.com) for more tips. 📚✨",
        ),
        (
            # Real curly quotes/dashes/ellipsis, escaped so they cannot be
            # straightened by tooling; plain ASCII quotes would not exercise
            # the normalisation at all.
            "Smart quotes and dashes",
            "\u201cHello world\u201d and \u2018smart quotes\u2019 with em\u2014dash and en\u2013dash\u2026",
        ),
        ("Multiple punctuation", "Wow!!! This is amazing??? Really......"),
        (
            "Real AI response",
            "🌟 **Excellent!** You did a great job with that conversation! Here are some tips:\n\n- Use *natural* expressions\n- Practice `daily`\n- Visit [practice site](https://example.com)\n\n💪 Keep practicing! 🎯",
        ),
    ]

    print("Testing text cleaning function:\n")
    for index, (name, raw_text) in enumerate(test_cases, 1):
        print(f"Test {index}: {name}")
        print(f"Input: '{raw_text}'")
        cleaned = tts.clean_text_for_speech(raw_text)
        print(f"Output: '{cleaned}'")

        # Verdict: first failing check wins.
        has_emojis = any(_is_emoji(char) for char in cleaned)
        has_markdown = any(marker in cleaned for marker in "*`#")
        if has_emojis:
            print("❌ Still contains emojis")
        elif has_markdown:
            print("❌ Still contains markdown")
        elif not cleaned.strip():
            print("⚠️ Text became empty after cleaning")
        else:
            print("✅ Cleaned successfully")
        print("-" * 50)
        print()
# Script entry point: print the cleaning report only when this file is run
# directly, not when it is imported.
if __name__ == "__main__":
    test_text_cleaning()