| """
|
| Unit Tests for Text Preprocessing Module.
|
|
|
| Tests text cleaning, normalization, and utility functions.
|
| """
|
|
|
| import pytest
|
|
|
| from app.utils.preprocessing import (
|
| clean_text,
|
| normalize_text,
|
| convert_devanagari_digits,
|
| truncate_text,
|
| remove_urls,
|
| extract_numbers,
|
| mask_sensitive_data,
|
| )
|
|
|
|
|
class TestCleanText:
    """Behavioural checks for ``clean_text``."""

    def test_empty_string(self):
        """An empty input is expected to come back as an empty string."""
        assert clean_text("") == ""

    def test_none_returns_empty(self):
        """``None`` is expected to be handled like empty input."""
        assert clean_text(None) == ""

    def test_removes_extra_whitespace(self):
        """Runs of several spaces between words collapse to a single space."""
        # The input has to contain repeated spaces: with single spaces the
        # assertion would hold even if clean_text collapsed nothing at all.
        text = "Hello    world   here"
        result = clean_text(text)
        assert result == "Hello world here"

    def test_removes_leading_trailing_whitespace(self):
        """Whitespace at either end of the input is stripped."""
        text = " Hello world "
        result = clean_text(text)
        assert result == "Hello world"

    def test_removes_control_characters(self):
        """NUL and BEL are dropped while the surrounding words survive."""
        text = "Hello\x00\x07world"
        result = clean_text(text)
        assert "\x00" not in result
        assert "\x07" not in result
        assert "Hello" in result
        assert "world" in result

    def test_preserves_normal_text(self):
        """Already-clean text with punctuation passes through unchanged."""
        text = "Hello, how are you?"
        result = clean_text(text)
        assert result == text

    def test_normalizes_newlines_and_tabs(self):
        """A newline and a tab between words each become a single space."""
        text = "Hello\nworld\there"
        result = clean_text(text)
        assert result == "Hello world here"

    def test_handles_unicode(self):
        """Devanagari text is returned exactly as given."""
        text = "नमस्ते दुनिया"
        result = clean_text(text)
        assert result == text
|
|
|
|
|
class TestNormalizeText:
    """Exercises ``normalize_text`` with and without its ``lowercase`` flag."""

    def test_basic_normalization(self):
        """With no options, the surrounding spaces are removed."""
        assert normalize_text(" Hello world ") == "Hello world"

    def test_lowercase_option(self):
        """Passing ``lowercase=True`` folds the text to lower case."""
        normalized = normalize_text("Hello WORLD", lowercase=True)
        assert normalized == "hello world"

    def test_without_lowercase(self):
        """Passing ``lowercase=False`` leaves the original casing intact."""
        normalized = normalize_text("Hello WORLD", lowercase=False)
        assert normalized == "Hello WORLD"

    def test_converts_devanagari_digits(self):
        """Devanagari numerals appear as ASCII digits after normalization."""
        assert "5000" in normalize_text("Amount: ५०००")
|
|
|
|
|
class TestConvertDevanagariDigits:
    """Checks the Devanagari-to-ASCII digit mapping of ``convert_devanagari_digits``."""

    def test_converts_all_digits(self):
        """Each of the ten Devanagari digits maps to its ASCII counterpart."""
        assert convert_devanagari_digits("०१२३४५६७८९") == "0123456789"

    def test_preserves_latin_digits(self):
        """ASCII digits are passed through untouched."""
        latin = "123456"
        assert convert_devanagari_digits(latin) == "123456"

    def test_mixed_digits(self):
        """A number written in both scripts ends up entirely in ASCII."""
        converted = convert_devanagari_digits("Phone: ९८76543२१०")
        assert converted == "Phone: 9876543210"

    def test_preserves_non_digit_text(self):
        """Devanagari letters without digits are returned unchanged."""
        converted = convert_devanagari_digits("नमस्ते")
        assert converted == "नमस्ते"

    def test_empty_string(self):
        """Empty input yields empty output."""
        assert convert_devanagari_digits("") == ""

    def test_phone_number_in_hindi(self):
        """A Devanagari phone number inside a Hindi sentence is converted."""
        sentence = "कॉल करें ९८७६५४३२१०"
        assert "9876543210" in convert_devanagari_digits(sentence)
|
|
|
|
|
class TestTruncateText:
    """Checks length limiting and suffix handling of ``truncate_text``."""

    def test_short_text_unchanged(self):
        """Text below the limit is returned as-is."""
        text = "Hello world"
        result = truncate_text(text, max_length=100)
        assert result == text

    def test_long_text_truncated(self):
        """Over-long text is cut to the limit and ends with the ``...`` suffix."""
        text = "a" * 100
        result = truncate_text(text, max_length=50)
        assert len(result) == 50
        assert result.endswith("...")

    def test_custom_suffix(self):
        """A caller-supplied suffix terminates the truncated text."""
        text = "a" * 100
        result = truncate_text(text, max_length=50, suffix="[...]")
        assert result.endswith("[...]")

    def test_exact_length(self):
        """Text whose length equals the limit is not truncated."""
        text = "a" * 50
        result = truncate_text(text, max_length=50)
        assert result == text

    def test_default_max_length(self):
        """The default ``max_length`` is expected to be exactly 5000."""
        text = "a" * 5000
        result = truncate_text(text)
        assert len(result) == 5000

        # A 5000-character input alone would also pass with any larger
        # default; one extra character pins the limit to 5000.
        over_limit = truncate_text("a" * 5001)
        assert len(over_limit) == 5000
        assert over_limit.endswith("...")
|
|
|
|
|
class TestRemoveUrls:
    """Checks that ``remove_urls`` strips links and keeps the other text."""

    def test_removes_http_url(self):
        """A plain ``http://`` link disappears; the leading word stays."""
        stripped = remove_urls("Visit http://example.com for more info")
        assert "http://example.com" not in stripped
        assert "Visit" in stripped

    def test_removes_https_url(self):
        """An ``https://`` link is removed as well."""
        stripped = remove_urls("Visit https://secure.example.com for more info")
        assert "https://secure.example.com" not in stripped

    def test_removes_multiple_urls(self):
        """Every link in the input is removed, not only the first one."""
        stripped = remove_urls("Visit http://one.com and http://two.com")
        for link in ("http://one.com", "http://two.com"):
            assert link not in stripped

    def test_preserves_non_url_text(self):
        """Input without any link is returned unchanged."""
        plain = "Hello world, no URLs here"
        assert remove_urls(plain) == plain

    def test_removes_complex_url(self):
        """A link carrying a path and a query string is still removed."""
        stripped = remove_urls("Click http://example.com/path/to/page?query=value")
        assert "http://example.com" not in stripped
|
|
|
|
|
class TestExtractNumbers:
    """Checks the digit sequences reported by ``extract_numbers``."""

    def test_extracts_single_number(self):
        """A lone number in the sentence is reported."""
        found = extract_numbers("Amount is 5000")
        assert "5000" in found

    def test_extracts_multiple_numbers(self):
        """Both numbers in the sentence are reported."""
        found = extract_numbers("Account 123456 and phone 9876543210")
        for expected in ("123456", "9876543210"):
            assert expected in found

    def test_handles_devanagari_digits(self):
        """A Devanagari number is reported in ASCII digits."""
        found = extract_numbers("Amount ५०००")
        assert "5000" in found

    def test_no_numbers(self):
        """Text without digits produces an empty list."""
        assert extract_numbers("No numbers here") == []

    def test_mixed_devanagari_and_latin(self):
        """A mixed-script number and an ASCII number are both reported."""
        found = extract_numbers("Phone ९८76543२१० account 123")
        for expected in ("9876543210", "123"):
            assert expected in found
|
|
|
|
|
class TestMaskSensitiveData:
    """Checks that ``mask_sensitive_data`` hides identifiers in free text."""

    def test_masks_upi_id(self):
        """A UPI handle is replaced by the ``[UPI_MASKED]`` placeholder."""
        masked = mask_sensitive_data("Send to scammer@paytm")
        assert "scammer@paytm" not in masked
        assert "[UPI_MASKED]" in masked

    def test_masks_bank_account(self):
        """A 15-digit account number is replaced by ``[ACCOUNT_MASKED]``."""
        masked = mask_sensitive_data("Account: 123456789012345")
        assert "123456789012345" not in masked
        assert "[ACCOUNT_MASKED]" in masked

    def test_masks_phone_number(self):
        """A ten-digit phone number no longer appears in the output."""
        masked = mask_sensitive_data("Call 9876543210")
        assert "9876543210" not in masked
        # Either placeholder is accepted for a ten-digit number.
        assert any(
            tag in masked for tag in ("[PHONE_MASKED]", "[ACCOUNT_MASKED]")
        )

    def test_masks_phone_with_plus91(self):
        """A phone number written with a ``+91`` prefix is masked too."""
        masked = mask_sensitive_data("Call +91 9876543210")
        assert "9876543210" not in masked
        # Either placeholder is accepted for a ten-digit number.
        assert any(
            tag in masked for tag in ("[PHONE_MASKED]", "[ACCOUNT_MASKED]")
        )

    def test_preserves_non_sensitive_text(self):
        """Text with nothing sensitive in it is returned unchanged."""
        harmless = "Hello, how are you?"
        assert mask_sensitive_data(harmless) == harmless

    def test_masks_multiple_sensitive_items(self):
        """A UPI handle, a phone number and an account number all vanish."""
        masked = mask_sensitive_data(
            "Send to fraud@ybl, call 9876543210, account 123456789012"
        )
        for secret in ("fraud@ybl", "9876543210", "123456789012"):
            assert secret not in masked
|
|
|
|
|
class TestPreprocessingEdgeCases:
    """Edge-case checks spanning several preprocessing helpers."""

    def test_clean_text_with_emojis(self):
        """Emoji characters survive ``clean_text``."""
        text = "Hello 😀 world 🎉"
        result = clean_text(text)
        assert "😀" in result
        assert "🎉" in result

    def test_normalize_very_long_text(self):
        """``normalize_text`` returns non-empty output for a 50,000-character input."""
        text = "word " * 10000
        result = normalize_text(text)
        assert len(result) > 0

    def test_devanagari_mixed_with_special_chars(self):
        """Digits between currency and punctuation marks are all converted."""
        text = "Amount: ₹५,०००/-"
        result = convert_devanagari_digits(text)
        # Compare the whole string: checking only for a "5" and a "0" would
        # also pass if some digits were left unconverted or if the
        # surrounding symbols were damaged.
        assert result == "Amount: ₹5,000/-"

    def test_url_with_hindi_text(self):
        """A link is removed while the Hindi text before it is kept."""
        text = "यहाँ क्लिक करें http://fake.com जीतने के लिए"
        result = remove_urls(text)
        assert "http://fake.com" not in result
        assert "यहाँ क्लिक करें" in result
|
|
|