# NOTE: extraction residue from a web file viewer (page status: "Sleeping").
# File size: 6,075 bytes; revision id 3fde6b6 (presumably a commit hash).
#!/usr/bin/env python3
"""
Test script for TTS text cleaning functionality
"""
import sys
import os
# Add the parent directory to sys.path to import from src
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from src.services.tts_service import TTSService
def test_text_cleaning():
    """Run sample inputs through ``clean_text_for_speech`` and print a report.

    A ``TTSService`` instance is used when it can be constructed. If that
    raises, a local regex-based stand-in exposing the same
    ``clean_text_for_speech`` method is used instead, and a notice is printed
    so the report is not mistaken for a check of the real service.

    For every sample the input, the cleaned output and one verdict line are
    printed. A sample is reported as failed when the output still contains a
    character from the stripped emoji ranges or one of the markdown characters
    ``*``, a backtick or ``#``; an empty output is reported as a warning.

    Side effects: sets the ``YOUR_DEEPGRAM_API_KEY`` environment variable to a
    dummy value and writes to stdout. Returns ``None``.
    """
    import re  # local import: the module header only imports sys and os

    # Code-point ranges that the cleaner strips. A single pattern is shared by
    # the fallback cleaner and the verification step so the two cannot drift
    # apart (the previous check only looked at four of the eight ranges).
    emoji_pattern = re.compile(
        r'['
        r'\U0001F600-\U0001F64F'  # emoticons
        r'\U0001F300-\U0001F5FF'  # miscellaneous symbols and pictographs
        r'\U0001F680-\U0001F6FF'  # transport and map symbols
        r'\U0001F1E0-\U0001F1FF'  # regional indicators (flag letters)
        r'\U00002600-\U000026FF'  # miscellaneous symbols
        r'\U00002700-\U000027BF'  # dingbats
        r'\U0000FE00-\U0000FE0F'  # variation selectors
        r'\U0001F900-\U0001F9FF'  # supplemental symbols and pictographs
        r']'
    )

    print("🧪 Testing TTS Text Cleaning Function\n")

    # Create TTS service instance (dummy key: only the cleaning function is
    # exercised here, no request is sent by this script).
    os.environ["YOUR_DEEPGRAM_API_KEY"] = "test_key"

    try:
        tts = TTSService()
    except Exception as exc:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt/SystemExit still propagate, and say out loud that
        # the stand-in is being tested instead of the real service.
        print(f"⚠️ TTSService unavailable ({exc!r}); using built-in fallback cleaner\n")

        class TestTTSService:
            """Stand-in that provides only ``clean_text_for_speech``."""

            def clean_text_for_speech(self, text):
                """Return *text* with markdown, emojis and odd characters removed."""
                if not text or not isinstance(text, str):
                    return ""

                # Remove markdown formatting
                text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # bold **text**
                text = re.sub(r'\*(.*?)\*', r'\1', text)  # italic *text*
                text = re.sub(r'`(.*?)`', r'\1', text)  # code `text`
                text = re.sub(r'#{1,6}\s', '', text)  # headers # ## ###
                text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)  # [text](url) -> text

                # Remove emojis and special unicode characters
                text = emoji_pattern.sub('', text)

                # Normalise typographic punctuation. Escapes are used so the
                # pattern cannot silently degrade to ASCII quotes (which made
                # the earlier pattern a no-op); single smart quotes map to an
                # apostrophe so contractions stay readable.
                text = re.sub(r'[\u201C\u201D]', '"', text)
                text = re.sub(r'[\u2018\u2019]', "'", text)
                text = re.sub(r'[–—]', '-', text)
                text = re.sub(r'[…]', '...', text)
                text = re.sub(r'[«»]', '"', text)
                text = re.sub(r'[‹›]', "'", text)

                # Remove zero-width and control characters. Tab, newline and
                # the other whitespace controls (U+0009-U+000D) are excluded
                # on purpose: stripping them here glued the text on adjacent
                # lines together before whitespace could be collapsed.
                text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
                text = re.sub(r'[\u0000-\u0008\u000E-\u001F\u007F-\u009F]', '', text)

                # Clean up extra whitespace (newlines become single spaces)
                text = re.sub(r'\s+', ' ', text)
                text = text.strip()

                # Remove multiple consecutive punctuation
                text = re.sub(r'\.{3,}', '...', text)
                text = re.sub(r'!{2,}', '!', text)
                text = re.sub(r'\?{2,}', '?', text)

                # Exactly one space between sentence punctuation and a
                # following capital letter
                text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)
                return text

        tts = TestTTSService()

    # Test cases
    test_cases = [
        {
            "name": "Simple text",
            "input": "Hello, how are you today?",
            "expected_clean": True,
        },
        {
            "name": "Text with emojis",
            "input": "Great job! 🎉 You're doing amazing! 🌟 Keep it up! 💪",
            "expected_clean": True,
        },
        {
            "name": "Markdown formatting",
            "input": "This is **bold** and this is *italic* and `code`",
            "expected_clean": True,
        },
        {
            "name": "Complex markdown with links",
            "input": "Check out [this link](https://example.com) and ## Header text",
            "expected_clean": True,
        },
        {
            "name": "Mixed content",
            "input": "🎯 **Practice Goal**: Learn English conversation skills! Visit [our website](https://wise.com) for more tips. 📚✨",
            "expected_clean": True,
        },
        {
            "name": "Smart quotes and dashes",
            # Written with escapes so the sample really contains typographic
            # quotes; with plain ASCII quotes it exercised nothing.
            "input": "\u201cHello world\u201d and \u2018smart quotes\u2019 with em—dash and en–dash…",
            "expected_clean": True,
        },
        {
            "name": "Multiple punctuation",
            "input": "Wow!!! This is amazing??? Really......",
            "expected_clean": True,
        },
        {
            "name": "Real AI response",
            "input": "🌟 **Excellent!** You did a great job with that conversation! Here are some tips:\n\n- Use *natural* expressions\n- Practice `daily`\n- Visit [practice site](https://example.com)\n\n💪 Keep practicing! 🎯",
            "expected_clean": True,
        },
    ]

    print("Testing text cleaning function:\n")

    for i, test_case in enumerate(test_cases, 1):
        print(f"Test {i}: {test_case['name']}")
        print(f"Input: '{test_case['input']}'")

        cleaned = tts.clean_text_for_speech(test_case['input'])
        print(f"Output: '{cleaned}'")

        # Check if cleaning was successful
        has_emojis = emoji_pattern.search(cleaned) is not None
        # '*' already covers '**', so three markers are enough.
        has_markdown = any(marker in cleaned for marker in ('*', '`', '#'))

        if has_emojis:
            print("❌ Still contains emojis")
        elif has_markdown:
            print("❌ Still contains markdown")
        elif not cleaned.strip():
            print("⚠️ Text became empty after cleaning")
        else:
            print("✅ Cleaned successfully")

        print("-" * 50)
        print()
# Script entry point: run the cleaning report only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    test_text_cleaning()
|