# Hugging Face Space: ABAO77/Run_code_api (status: Sleeping)
# File size: 6,075 Bytes - revision 3fde6b6

#!/usr/bin/env python3
"""
Test script for TTS text cleaning functionality
"""

import sys
import os

# Add the parent directory to sys.path to import from src
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from src.services.tts_service import TTSService

# Code-point ranges treated as emoji. Shared by the fallback cleaner and the
# result check so both agree on what counts as an emoji.
_EMOJI_RANGES = (
    (0x1F600, 0x1F64F),  # Emoticons
    (0x1F300, 0x1F5FF),  # Misc symbols and pictographs
    (0x1F680, 0x1F6FF),  # Transport and map symbols
    (0x1F1E0, 0x1F1FF),  # Regional indicators (flags)
    (0x2600, 0x26FF),    # Misc symbols
    (0x2700, 0x27BF),    # Dingbats
    (0x1F900, 0x1F9FF),  # Supplemental symbols and pictographs
)


def _contains_emoji(text):
    """Return True if any character of *text* lies in one of ``_EMOJI_RANGES``."""
    return any(lo <= ord(ch) <= hi for ch in text for lo, hi in _EMOJI_RANGES)


class _FallbackTTSService:
    """Minimal stand-in used when the real ``TTSService`` cannot be built.

    Only ``clean_text_for_speech`` is provided.
    NOTE(review): presumably meant to mirror the real service's cleaner;
    confirm against ``src/services/tts_service.py``.
    """

    def clean_text_for_speech(self, text):
        """Return *text* stripped of markdown, emojis and odd punctuation.

        Args:
            text: Raw text to clean. Anything falsy or not a ``str`` yields "".

        Returns:
            The cleaned, single-line string.
        """
        import re

        if not text or not isinstance(text, str):
            return ""

        # Remove markdown formatting (bold must go before italic).
        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)     # **bold**
        text = re.sub(r'\*(.*?)\*', r'\1', text)         # *italic*
        text = re.sub(r'`(.*?)`', r'\1', text)           # `code`
        text = re.sub(r'#{1,6}\s', '', text)             # # ## ### headers
        text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)  # [text](url) -> text

        # Remove emojis plus variation selectors (U+FE00-U+FE0F).
        emoji_class = "".join(
            f"{chr(lo)}-{chr(hi)}" for lo, hi in _EMOJI_RANGES
        )
        text = re.sub(f"[{emoji_class}\uFE00-\uFE0F]", "", text)

        # Normalise typographic punctuation to ASCII. Escapes are used so the
        # patterns survive editors/tools that flatten curly quotes.
        text = re.sub("[\u201C\u201D\u00AB\u00BB]", '"', text)  # double quotes, guillemets
        text = re.sub("[\u2018\u2019\u2039\u203A]", "'", text)  # single quotes, single guillemets
        text = re.sub("[\u2013\u2014]", "-", text)              # en dash, em dash
        text = text.replace("\u2026", "...")                    # ellipsis

        # Remove zero-width characters and control characters. Tab, newline
        # and friends (U+0009-U+000D) are deliberately kept here so the
        # whitespace pass below turns them into a space instead of gluing
        # the neighbouring words together.
        text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
        text = re.sub(r'[\u0000-\u0008\u000E-\u001F\u007F-\u009F]', '', text)

        # Collapse every whitespace run into a single space.
        text = re.sub(r'\s+', ' ', text).strip()

        # Collapse repeated punctuation.
        text = re.sub(r'\.{3,}', '...', text)
        text = re.sub(r'!{2,}', '!', text)
        text = re.sub(r'\?{2,}', '?', text)

        # Ensure exactly one space between sentence end and the next capital.
        text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)

        return text


def test_text_cleaning():
    """Run sample inputs through the text cleaner and print a report.

    Tries to build the real ``TTSService``; if construction raises, a
    built-in fallback cleaner is used instead. For every sample the input,
    the cleaned output and a verdict (leftover emojis, leftover markdown,
    empty result, or success) are printed. Nothing is returned or asserted.
    """
    print("🧪 Testing TTS Text Cleaning Function\n")

    # Create TTS service instance (without API key for testing cleaning function only)
    os.environ["YOUR_DEEPGRAM_API_KEY"] = "test_key"  # Dummy key for testing
    try:
        tts = TTSService()
    except Exception as exc:
        # Report why the real service is unavailable instead of hiding it,
        # then continue with the self-contained cleaner.
        print(f"⚠️  TTSService unavailable ({exc!r}); using the built-in fallback cleaner\n")
        tts = _FallbackTTSService()

    # Test cases
    test_cases = [
        {
            "name": "Simple text",
            "input": "Hello, how are you today?",
            "expected_clean": True
        },
        {
            "name": "Text with emojis",
            "input": "Great job! 🎉 You're doing amazing! 🌟 Keep it up! 💪",
            "expected_clean": True
        },
        {
            "name": "Markdown formatting",
            "input": "This is **bold** and this is *italic* and `code`",
            "expected_clean": True
        },
        {
            "name": "Complex markdown with links",
            "input": "Check out [this link](https://example.com) and ## Header text",
            "expected_clean": True
        },
        {
            "name": "Mixed content",
            "input": "🎯 **Practice Goal**: Learn English conversation skills! Visit [our website](https://wise.com) for more tips. 📚✨",
            "expected_clean": True
        },
        {
            "name": "Smart quotes and dashes",
            # Curly quotes are written as escapes so this case really
            # exercises smart-quote handling.
            "input": "\u201cHello world\u201d and \u2018smart quotes\u2019 with em—dash and en–dash…",
            "expected_clean": True
        },
        {
            "name": "Multiple punctuation",
            "input": "Wow!!! This is amazing??? Really......",
            "expected_clean": True
        },
        {
            "name": "Real AI response",
            "input": "🌟 **Excellent!** You did a great job with that conversation! Here are some tips:\n\n- Use *natural* expressions\n- Practice `daily`\n- Visit [practice site](https://example.com)\n\n💪 Keep practicing! 🎯",
            "expected_clean": True
        }
    ]

    print("Testing text cleaning function:\n")

    for i, test_case in enumerate(test_cases, 1):
        print(f"Test {i}: {test_case['name']}")
        print(f"Input:  '{test_case['input']}'")

        cleaned = tts.clean_text_for_speech(test_case['input'])
        print(f"Output: '{cleaned}'")

        # Check if cleaning was successful
        has_emojis = _contains_emoji(cleaned)
        # A single '*' also covers '**', so three markers are enough.
        has_markdown = any(marker in cleaned for marker in ('*', '`', '#'))

        if has_emojis:
            print("❌ Still contains emojis")
        elif has_markdown:
            print("❌ Still contains markdown")
        elif not cleaned.strip():
            print("⚠️  Text became empty after cleaning")
        else:
            print("✅ Cleaned successfully")

        print("-" * 50)
        print()

# Script entry point: run the cleaning demo only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    test_text_cleaning()