SpamShield / utils.py
M-Arjun's picture
0.4 and 0.4 - lite
0ec6609 verified
import re
import string
# Patterns and the punctuation table are built once at import time so that
# every call to preprocess_text() reuses them instead of rebuilding them.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_REPEAT_RE = re.compile(r'(.)\1{2,}')
_SPACED_RE = re.compile(r'\b(?:\w\s){2,}\w\b')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _squeeze_run(match):
    """Shorten one run of three or more identical characters."""
    char = match.group(1)
    if char.isdigit():
        # Digits carry meaning ("1000" must not become "10"): keep the run.
        return match.group(0)
    if char.isalpha():
        # Keep two letters so "freeeeee" -> "free"; collapsing to a single
        # letter would produce "fre", which no longer matches the real word.
        return char * 2
    # Punctuation / whitespace runs collapse to a single character.
    return char


def preprocess_text(text: str) -> str:
    """
    Lightweight text normalization.

    Steps, applied in this order:
    - lowercase
    - replace http(s):// and www. URLs with a placeholder; the placeholder is
      inserted as " [URL] " and, because punctuation is stripped afterwards,
      ends up in the result as the token "URL"
    - shorten runs of 3+ identical characters: letters are reduced to two
      ("freeeeee" -> "free"), digit runs are left untouched, any other
      character is reduced to one
    - join sequences of three or more single characters separated by
      whitespace ("f r e e" -> "free")
    - strip ASCII punctuation (string.punctuation)
    - collapse whitespace runs to one space and trim both ends

    Returns "" for any falsy input (empty string, None).
    """
    if not text:
        return ""

    # Lowercase first so every later step only sees lowercase letters.
    text = text.lower()

    # URL normalization (done before punctuation stripping, while the
    # "://" and "." that identify a URL are still present).
    text = _URL_RE.sub(' [URL] ', text)

    # Shorten excessive repeated characters (e.g. "freeeeee" -> "free").
    text = _REPEAT_RE.sub(_squeeze_run, text)

    # Handle spaced out characters (e.g. "f r e e" -> "free").
    # Only matches when there are more than 2 single characters in a row;
    # only the space characters inside the match are removed.
    text = _SPACED_RE.sub(lambda m: m.group(0).replace(' ', ''), text)

    # Strip punctuation.
    text = text.translate(_PUNCT_TABLE)

    # Remove extra whitespace.
    return _WHITESPACE_RE.sub(' ', text).strip()