Run_code_api / test_tts_cleaning.py
ABAO77's picture
feat: add text cleaning functionality for TTS service to enhance input processing
3fde6b6
raw
history blame
6.08 kB
#!/usr/bin/env python3
"""
Test script for TTS text cleaning functionality
"""
import sys
import os
# Add the parent directory to sys.path to import from src
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from src.services.tts_service import TTSService
# Inclusive code point ranges that the fallback cleaner strips and that the
# result checker flags. Sharing one table keeps the two from disagreeing about
# which characters count as emoji.
_EMOJI_RANGES = (
    (0x1F600, 0x1F64F),  # emoticons
    (0x1F300, 0x1F5FF),  # miscellaneous symbols and pictographs
    (0x1F680, 0x1F6FF),  # transport and map symbols
    (0x1F1E0, 0x1F1FF),  # regional indicators (flags)
    (0x2600, 0x26FF),    # miscellaneous symbols
    (0x2700, 0x27BF),    # dingbats
    (0xFE00, 0xFE0F),    # variation selectors
    (0x1F900, 0x1F9FF),  # supplemental symbols and pictographs
)

# Characters whose presence in cleaned text means markdown survived.
_MARKDOWN_MARKERS = ('*', '`', '#')


def _is_emoji(char):
    """Return True if *char* lies in one of the ``_EMOJI_RANGES``."""
    code_point = ord(char)
    return any(low <= code_point <= high for low, high in _EMOJI_RANGES)


class _FallbackTTSService:
    """Stand-in that offers only ``clean_text_for_speech``.

    Used when ``TTSService()`` cannot be constructed, so the cleaning checks
    can still run. The name deliberately does not start with ``Test`` so that
    pytest does not try to collect it.
    """

    def clean_text_for_speech(self, text):
        """Return *text* with markdown, emoji and odd punctuation removed.

        Args:
            text: The raw text. Anything falsy or not a ``str`` yields ``""``.

        Returns:
            A single-line string with normalised quotes, dashes, whitespace
            and repeated punctuation.
        """
        # Local import: the file's import block only provides sys and os.
        import re

        if not text or not isinstance(text, str):
            return ""

        # Markdown: keep the inner text, drop the markers.
        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)      # **bold**
        text = re.sub(r'\*(.*?)\*', r'\1', text)          # *italic*
        text = re.sub(r'`(.*?)`', r'\1', text)            # `code`
        text = re.sub(r'#{1,6}\s', '', text)              # "# ", "## ", ...
        text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)   # [text](url) -> text

        # Emoji and variation selectors.
        text = "".join(char for char in text if not _is_emoji(char))

        # Typographic punctuation -> ASCII. Escapes are used instead of the
        # literal characters so the patterns survive copy/paste and encoding
        # round-trips; a pattern degraded to ASCII quotes matches nothing
        # useful.
        text = re.sub(r'[\u201C\u201D]', '"', text)       # curly double quotes
        text = re.sub(r'[\u2018\u2019]', "'", text)       # curly single quotes
        text = re.sub(r'[\u2013\u2014]', '-', text)       # en dash, em dash
        text = re.sub(r'\u2026', '...', text)             # ellipsis character
        text = re.sub(r'[\u00AB\u00BB]', '"', text)       # double guillemets
        text = re.sub(r'[\u2039\u203A]', "'", text)       # single guillemets

        # Line breaks and tabs are control characters too. Turn them into
        # spaces BEFORE stripping control characters, otherwise words on
        # adjacent lines are glued together ("tips:- Use").
        text = re.sub(r'[\t\n\r\f\v]+', ' ', text)

        # Zero-width and remaining control characters.
        text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
        text = re.sub(r'[\u0000-\u001F\u007F-\u009F]', '', text)

        # Collapse whitespace runs and trim the ends.
        text = re.sub(r'\s+', ' ', text).strip()

        # Collapse repeated punctuation.
        text = re.sub(r'\.{3,}', '...', text)
        text = re.sub(r'!{2,}', '!', text)
        text = re.sub(r'\?{2,}', '?', text)

        # Exactly one space between sentence-ending punctuation and a
        # following capital letter.
        text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)
        return text


def test_text_cleaning():
    """Run sample inputs through ``clean_text_for_speech`` and print a report.

    For each case the input, the cleaned output and a one-line verdict are
    printed. The verdict flags leftover emoji, leftover markdown markers, or
    text that became empty. Nothing is asserted and nothing is returned.

    The real ``TTSService`` is used when it can be constructed; otherwise a
    built-in fallback cleaner is used and a notice says so.
    """
    print("🧪 Testing TTS Text Cleaning Function\n")

    # Dummy key so the service can be constructed without real credentials.
    os.environ["YOUR_DEEPGRAM_API_KEY"] = "test_key"
    try:
        tts = TTSService()
    except Exception as exc:
        # Catch Exception rather than everything, so Ctrl-C and SystemExit
        # still propagate. NOTE: the module-level import of TTSService runs
        # before this function, so an ImportError there is not handled here.
        print(f"Note: TTSService unavailable ({exc!r}); using the built-in fallback cleaner\n")
        tts = _FallbackTTSService()

    test_cases = [
        {
            "name": "Simple text",
            "input": "Hello, how are you today?",
            "expected_clean": True,
        },
        {
            "name": "Text with emojis",
            "input": "Great job! 🎉 You're doing amazing! 🌟 Keep it up! 💪",
            "expected_clean": True,
        },
        {
            "name": "Markdown formatting",
            "input": "This is **bold** and this is *italic* and `code`",
            "expected_clean": True,
        },
        {
            "name": "Complex markdown with links",
            "input": "Check out [this link](https://example.com) and ## Header text",
            "expected_clean": True,
        },
        {
            "name": "Mixed content",
            "input": "🎯 **Practice Goal**: Learn English conversation skills! Visit [our website](https://wise.com) for more tips. 📚✨",
            "expected_clean": True,
        },
        {
            "name": "Smart quotes and dashes",
            # Escapes keep the curly quotes intact; with plain ASCII quotes
            # this case would not exercise smart-quote handling at all.
            "input": "\u201cHello world\u201d and \u2018smart quotes\u2019 with em\u2014dash and en\u2013dash\u2026",
            "expected_clean": True,
        },
        {
            "name": "Multiple punctuation",
            "input": "Wow!!! This is amazing??? Really......",
            "expected_clean": True,
        },
        {
            "name": "Real AI response",
            "input": "🌟 **Excellent!** You did a great job with that conversation! Here are some tips:\n\n- Use *natural* expressions\n- Practice `daily`\n- Visit [practice site](https://example.com)\n\n💪 Keep practicing! 🎯",
            "expected_clean": True,
        },
    ]

    print("Testing text cleaning function:\n")
    for i, test_case in enumerate(test_cases, 1):
        print(f"Test {i}: {test_case['name']}")
        print(f"Input: '{test_case['input']}'")
        cleaned = tts.clean_text_for_speech(test_case['input'])
        print(f"Output: '{cleaned}'")

        # Verdict: the first failing check wins.
        has_emojis = any(_is_emoji(char) for char in cleaned)
        has_markdown = any(marker in cleaned for marker in _MARKDOWN_MARKERS)
        if has_emojis:
            print("❌ Still contains emojis")
        elif has_markdown:
            print("❌ Still contains markdown")
        elif not cleaned.strip():
            print("⚠️ Text became empty after cleaning")
        else:
            print("✅ Cleaned successfully")
        print("-" * 50)
        print()
# Script entry point: run the report only on direct execution, so importing
# this module (for example during pytest collection) has no side effects.
if __name__ == "__main__":
    test_text_cleaning()