Spaces:

ABAO77
/

Run_code_api

Sleeping

App Files Files Community

Run_code_api / test_tts_cleaning.py

ABAO77

feat: add text cleaning functionality for TTS service to enhance input processing

3fde6b6 about 2 months ago

raw

history blame

6.08 kB

	#!/usr/bin/env python3
	"""
	Test script for TTS text cleaning functionality
	"""

	import sys
	import os

	# Add the parent directory to sys.path to import from src
	sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

	from src.services.tts_service import TTSService

	# Code-point ranges treated as emoji. Shared by the fallback cleaner and the
	# post-cleaning check so the two cannot disagree about what counts as an emoji.
	_EMOJI_RANGES = (
	    (0x1F600, 0x1F64F),  # emoticons
	    (0x1F300, 0x1F5FF),  # misc symbols and pictographs
	    (0x1F680, 0x1F6FF),  # transport and map symbols
	    (0x1F1E0, 0x1F1FF),  # regional indicators (flags)
	    (0x2600, 0x26FF),  # misc symbols
	    (0x2700, 0x27BF),  # dingbats
	    (0x1F900, 0x1F9FF),  # supplemental symbols and pictographs
	)

	# Characters whose presence in cleaned text means markdown survived.
	_MARKDOWN_MARKERS = ("*", "`", "#")


	class _FallbackTTSService:
	    """Stand-in used when ``TTSService`` cannot be constructed.

	    It only provides ``clean_text_for_speech`` so the cleaning rules can be
	    exercised without the real service or its credentials.
	    """

	    def clean_text_for_speech(self, text):
	        """Return ``text`` stripped of markdown, emojis and odd punctuation.

	        Args:
	            text: The raw text to clean. Anything that is not a non-empty
	                ``str`` yields an empty string.

	        Returns:
	            The cleaned, single-line string (possibly empty).
	        """
	        import re

	        if not text or not isinstance(text, str):
	            return ""

	        # Remove markdown formatting. Bold must be handled before italic,
	        # otherwise the italic rule would consume one '*' of each '**' pair.
	        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # **bold** -> bold
	        text = re.sub(r'\*(.*?)\*', r'\1', text)  # *italic* -> italic
	        text = re.sub(r'`(.*?)`', r'\1', text)  # `code` -> code
	        text = re.sub(r'#{1,6}\s', '', text)  # header markers: # ## ###
	        text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)  # [text](url) -> text

	        # Remove emojis in a single pass using the shared range table.
	        emoji_class = "[" + "".join(
	            f"{chr(low)}-{chr(high)}" for low, high in _EMOJI_RANGES
	        ) + "]"
	        text = re.sub(emoji_class, '', text)
	        # Variation selectors are left dangling once their emoji is removed.
	        text = re.sub(r'[\uFE00-\uFE0F]', '', text)

	        # Normalise typographic punctuation to ASCII. Escapes are used so the
	        # patterns survive tools that rewrite curly quotes. Single quotes map
	        # to an apostrophe (not a double quote) so contractions stay intact.
	        text = re.sub(r'[\u201C\u201D\u00AB\u00BB]', '"', text)  # curly double quotes, guillemets
	        text = re.sub(r'[\u2018\u2019\u2039\u203A]', "'", text)  # curly single quotes, single guillemets
	        text = re.sub(r'[\u2013\u2014]', '-', text)  # en dash, em dash
	        text = text.replace('\u2026', '...')  # horizontal ellipsis

	        # Turn every whitespace run (including newlines and tabs) into one
	        # space BEFORE stripping control characters; doing it the other way
	        # round deletes line breaks and glues neighbouring words together.
	        text = re.sub(r'\s+', ' ', text)
	        text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)  # zero-width characters
	        text = re.sub(r'[\u0000-\u001F\u007F-\u009F]', '', text)  # remaining control characters
	        text = re.sub(r' {2,}', ' ', text).strip()

	        # Collapse repeated punctuation.
	        text = re.sub(r'\.{3,}', '...', text)
	        text = re.sub(r'!{2,}', '!', text)
	        text = re.sub(r'\?{2,}', '?', text)

	        # Ensure exactly one space between a sentence end and the next capital.
	        text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)

	        return text


	def _contains_emoji(text):
	    """Return True if any character of ``text`` lies in ``_EMOJI_RANGES``."""
	    return any(
	        low <= ord(char) <= high
	        for char in text
	        for low, high in _EMOJI_RANGES
	    )


	def _contains_markdown(text):
	    """Return True if ``text`` still contains a markdown marker character."""
	    return any(marker in text for marker in _MARKDOWN_MARKERS)


	def test_text_cleaning():
	    """Run sample inputs through the TTS text cleaner and report each result.

	    Uses ``TTSService`` when it can be constructed and falls back to a
	    built-in cleaner otherwise. For every case the input, the cleaned output
	    and a verdict are printed.

	    Raises:
	        AssertionError: After all cases have been printed, if a case marked
	            ``expected_clean`` still contains emojis or markdown markers.
	    """
	    print("🧪 Testing TTS Text Cleaning Function\n")

	    # Create TTS service instance (dummy key: only the cleaning function is tested)
	    os.environ["YOUR_DEEPGRAM_API_KEY"] = "test_key"  # Dummy key for testing
	    try:
	        tts = TTSService()
	    except Exception as exc:
	        # Say so explicitly: otherwise the run silently tests the fallback
	        # copy of the rules instead of the real service.
	        print(f"⚠️ TTSService unavailable ({exc!r}); using built-in fallback cleaner\n")
	        tts = _FallbackTTSService()

	    # Test cases
	    test_cases = [
	        {
	            "name": "Simple text",
	            "input": "Hello, how are you today?",
	            "expected_clean": True,
	        },
	        {
	            "name": "Text with emojis",
	            "input": "Great job! 🎉 You're doing amazing! 🌟 Keep it up! 💪",
	            "expected_clean": True,
	        },
	        {
	            "name": "Markdown formatting",
	            "input": "This is **bold** and this is *italic* and `code`",
	            "expected_clean": True,
	        },
	        {
	            "name": "Complex markdown with links",
	            "input": "Check out [this link](https://example.com) and ## Header text",
	            "expected_clean": True,
	        },
	        {
	            "name": "Mixed content",
	            "input": "🎯 Practice Goal: Learn English conversation skills! Visit [our website](https://wise.com) for more tips. 📚✨",
	            "expected_clean": True,
	        },
	        {
	            "name": "Smart quotes and dashes",
	            # Escapes keep the curly quotes from being flattened to ASCII.
	            "input": "\u201cHello world\u201d and \u2018smart quotes\u2019 with em—dash and en–dash…",
	            "expected_clean": True,
	        },
	        {
	            "name": "Multiple punctuation",
	            "input": "Wow!!! This is amazing??? Really......",
	            "expected_clean": True,
	        },
	        {
	            "name": "Real AI response",
	            "input": "🌟 Excellent! You did a great job with that conversation! Here are some tips:\n\n- Use natural expressions\n- Practice `daily`\n- Visit [practice site](https://example.com)\n\n💪 Keep practicing! 🎯",
	            "expected_clean": True,
	        },
	    ]

	    print("Testing text cleaning function:\n")

	    failures = []
	    for index, case in enumerate(test_cases, 1):
	        print(f"Test {index}: {case['name']}")
	        print(f"Input: '{case['input']}'")

	        cleaned = tts.clean_text_for_speech(case['input'])
	        print(f"Output: '{cleaned}'")

	        # Check if cleaning was successful
	        dirty = True
	        if _contains_emoji(cleaned):
	            print("❌ Still contains emojis")
	        elif _contains_markdown(cleaned):
	            print("❌ Still contains markdown")
	        elif not cleaned.strip():
	            dirty = False  # reported as a warning, not as a failure
	            print("⚠️ Text became empty after cleaning")
	        else:
	            dirty = False
	            print("✅ Cleaned successfully")

	        if dirty and case["expected_clean"]:
	            failures.append(case["name"])

	        print("-" * 50)
	        print()

	    # Fail only after every case has been shown, so one bad case does not
	    # hide the output of the others.
	    assert not failures, f"Cleaning left emojis/markdown in: {', '.join(failures)}"

	# Entry point for running this file directly as a script.
	if __name__ == "__main__":
	    test_text_cleaning()