File size: 6,075 Bytes
3fde6b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python3
"""
Test script for TTS text cleaning functionality
"""

import os
import re
import sys

# Append the parent of this file's directory to sys.path so that the ``src``
# package can be imported below.
# NOTE(review): assumes ``src`` lives in that parent directory - confirm
# against the repository layout.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from src.services.tts_service import TTSService

# Unicode ranges treated as emoji / pictographic symbols.  One table drives
# both the fallback cleaner and the post-clean check so the two cannot drift
# apart.
_EMOJI_RANGES = (
    (0x1F600, 0x1F64F),  # emoticons
    (0x1F300, 0x1F5FF),  # miscellaneous symbols and pictographs
    (0x1F680, 0x1F6FF),  # transport and map symbols
    (0x1F1E0, 0x1F1FF),  # regional indicators (flags)
    (0x2600, 0x26FF),    # miscellaneous symbols
    (0x2700, 0x27BF),    # dingbats
    (0xFE00, 0xFE0F),    # variation selectors
    (0x1F900, 0x1F9FF),  # supplemental symbols and pictographs
)
_EMOJI_RE = re.compile(
    "[" + "".join(f"{chr(lo)}-{chr(hi)}" for lo, hi in _EMOJI_RANGES) + "]"
)

# Markdown constructs and their replacements, applied in this order: bold must
# be unwrapped before italic, otherwise the italic pattern matches the empty
# span between the two leading asterisks of ``**bold**``.
_MARKDOWN_SUBS = (
    (re.compile(r'\*\*(.*?)\*\*'), r'\1'),       # **bold** -> bold
    (re.compile(r'\*(.*?)\*'), r'\1'),           # *italic* -> italic
    (re.compile(r'`(.*?)`'), r'\1'),             # `code` -> code
    (re.compile(r'#{1,6}\s'), ''),               # "# ", "## ", ... headers
    (re.compile(r'\[(.*?)\]\(.*?\)'), r'\1'),    # [text](url) -> text
)

# Typographic punctuation mapped to ASCII.  Code points are spelled as escapes
# so the table survives editors/tools that normalise curly quotes.
_PUNCTUATION_TABLE = str.maketrans({
    "\u201C": '"', "\u201D": '"',    # curly double quotes
    "\u2018": "'", "\u2019": "'",    # curly single quotes / apostrophe
    "\u2013": "-", "\u2014": "-",    # en dash, em dash
    "\u2026": "...",                 # horizontal ellipsis
    "\u00AB": '"', "\u00BB": '"',    # double angle quotes
    "\u2039": "'", "\u203A": "'",    # single angle quotes
})

# Zero-width characters plus C0/C1 control characters.
_INVISIBLE_RE = re.compile(r'[\u200B-\u200D\uFEFF\u0000-\u001F\u007F-\u009F]')

# Characters whose presence in cleaned text is reported as leftover markdown.
_MARKDOWN_MARKERS = ("*", "`", "#")


class _FallbackTTSService:
    """Minimal stand-in offering only ``clean_text_for_speech``.

    Used when the real ``TTSService`` cannot be constructed, so the cleaning
    checks can still run.
    """

    def clean_text_for_speech(self, text):
        """Return ``text`` with markdown, emojis and odd punctuation removed.

        Args:
            text: Raw text; anything that is falsy or not a ``str`` yields
                an empty string.

        Returns:
            The cleaned, single-line string.
        """
        if not text or not isinstance(text, str):
            return ""

        for pattern, replacement in _MARKDOWN_SUBS:
            text = pattern.sub(replacement, text)

        text = _EMOJI_RE.sub('', text)
        text = text.translate(_PUNCTUATION_TABLE)

        # Turn whitespace (including newlines and tabs) into single spaces
        # BEFORE stripping control characters; doing it the other way round
        # deletes the newlines and glues neighbouring words together.
        text = re.sub(r'\s+', ' ', text)
        text = _INVISIBLE_RE.sub('', text)
        # Removing an invisible character can leave two spaces side by side.
        text = re.sub(r' {2,}', ' ', text).strip()

        # Collapse runs of repeated punctuation.
        text = re.sub(r'\.{3,}', '...', text)
        text = re.sub(r'!{2,}', '!', text)
        text = re.sub(r'\?{2,}', '?', text)

        # Exactly one space between sentence-ending punctuation and a
        # following capital letter.
        text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)

        return text


def _check_cleaned(cleaned):
    """Classify cleaned text; return ``(is_clean, message_to_print)``."""
    if _EMOJI_RE.search(cleaned):
        return False, "❌ Still contains emojis"
    if any(marker in cleaned for marker in _MARKDOWN_MARKERS):
        return False, "❌ Still contains markdown"
    if not cleaned.strip():
        return False, "⚠️  Text became empty after cleaning"
    return True, "✅ Cleaned successfully"


def test_text_cleaning():
    """Run the text-cleaning cases, print a report and fail on mismatches.

    Each case is passed through ``clean_text_for_speech`` of the real
    ``TTSService`` when it can be constructed, otherwise of a local fallback
    implementation.  Input, output and a verdict are printed per case.

    Raises:
        AssertionError: If the verdict of any case differs from its
            ``expected_clean`` flag.
    """

    print("🧪 Testing TTS Text Cleaning Function\n")

    # Dummy key so the service can be constructed without real credentials.
    # NOTE(review): the variable name looks like a placeholder - confirm it is
    # the one TTSService actually reads.
    os.environ["YOUR_DEEPGRAM_API_KEY"] = "test_key"  # Dummy key for testing
    try:
        tts = TTSService()
    except Exception as exc:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit still propagate.  Say so, instead of switching silently.
        print(f"⚠️  TTSService unavailable ({exc!r}); using fallback cleaner\n")
        tts = _FallbackTTSService()

    # Test cases
    test_cases = [
        {
            "name": "Simple text",
            "input": "Hello, how are you today?",
            "expected_clean": True,
        },
        {
            "name": "Text with emojis",
            "input": "Great job! 🎉 You're doing amazing! 🌟 Keep it up! 💪",
            "expected_clean": True,
        },
        {
            "name": "Markdown formatting",
            "input": "This is **bold** and this is *italic* and `code`",
            "expected_clean": True,
        },
        {
            "name": "Complex markdown with links",
            "input": "Check out [this link](https://example.com) and ## Header text",
            "expected_clean": True,
        },
        {
            "name": "Mixed content",
            "input": "🎯 **Practice Goal**: Learn English conversation skills! Visit [our website](https://wise.com) for more tips. 📚✨",
            "expected_clean": True,
        },
        {
            "name": "Smart quotes and dashes",
            # Escapes guarantee the input really contains typographic
            # characters rather than ASCII look-alikes.
            "input": "\u201CHello world\u201D and \u2018smart quotes\u2019 with em\u2014dash and en\u2013dash\u2026",
            "expected_clean": True,
        },
        {
            "name": "Multiple punctuation",
            "input": "Wow!!! This is amazing??? Really......",
            "expected_clean": True,
        },
        {
            "name": "Real AI response",
            "input": "🌟 **Excellent!** You did a great job with that conversation! Here are some tips:\n\n- Use *natural* expressions\n- Practice `daily`\n- Visit [practice site](https://example.com)\n\n💪 Keep practicing! 🎯",
            "expected_clean": True,
        },
    ]

    print("Testing text cleaning function:\n")

    failures = []
    for i, test_case in enumerate(test_cases, 1):
        print(f"Test {i}: {test_case['name']}")
        print(f"Input:  '{test_case['input']}'")

        cleaned = tts.clean_text_for_speech(test_case['input'])
        print(f"Output: '{cleaned}'")

        is_clean, message = _check_cleaned(cleaned)
        print(message)
        if is_clean != test_case["expected_clean"]:
            failures.append(test_case["name"])

        print("-" * 50)
        print()

    # Raised explicitly (not via ``assert``) so the check survives ``python -O``.
    if failures:
        raise AssertionError(
            "Text cleaning gave unexpected results for: " + ", ".join(failures)
        )

# Run the cleaning checks when this file is executed directly as a script.
if __name__ == "__main__":
    test_text_cleaning()