import re
import string


def preprocess_text(text: str) -> str:
    """Normalize raw text with a fixed sequence of lightweight clean-ups.

    Steps, applied in this order:

    1. Lowercase the text.
    2. Replace ``http://``/``https://`` and ``www.`` URLs with the
       ``[URL]`` placeholder. The brackets are removed again by step 5,
       so the returned text contains the bare token ``URL``.
    3. Collapse any non-digit character repeated three or more times in a
       row to a single occurrence (``sooooo`` -> ``so``). Digit runs are
       left untouched so numbers such as ``1000`` keep their value.
    4. Join runs of at least three single word characters separated by
       one whitespace character each (``f r e e`` -> ``free``). Only
       literal spaces inside the match are removed.
    5. Strip ASCII punctuation (``string.punctuation``).
    6. Collapse whitespace runs to one space and trim both ends.

    Args:
        text: The string to normalize. Falsy values (``""``, ``None``)
            are accepted.

    Returns:
        The normalized string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""

    text = text.lower()

    # URLs are matched after lowercasing, so the lowercase-only pattern is
    # sufficient for mixed-case schemes such as "HTTP://".
    text = re.sub(r'https?://\S+|www\.\S+', ' [URL] ', text)

    # Squash elongated characters ("sooooo" -> "so"). Digits are excluded:
    # collapsing them would silently change numbers ("1000" -> "10").
    # Newlines are excluded as well, matching what a plain "." would do.
    text = re.sub(r'([^\d\n])\1{2,}', r'\1', text)

    # Re-join letters that were spaced out ("f r e e" -> "free").
    text = re.sub(
        r'\b(\w\s){2,}\w\b',
        lambda match: match.group().replace(' ', ''),
        text,
    )

    # Drop ASCII punctuation; this also strips the brackets of "[URL]".
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Normalize every whitespace run to a single space.
    return re.sub(r'\s+', ' ', text).strip()