Spaces:
Build error
Build error
Create utils.py
Browse files
utils.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
# Patterns are compiled once at import time so repeated calls do not
# re-resolve them through the ``re`` module cache.
_HTML_TAG_RE = re.compile(r'<[^>]*?>')
# A URL is taken to run until the next whitespace character, so fragments
# (``#...``), tildes and other punctuation are removed with it instead of
# leaving debris behind.
_URL_RE = re.compile(r'https?://\S+')
# Whitespace is deliberately kept here; it is normalised in the final step.
# Dropping it at this stage would glue together words that were separated
# only by a newline or a tab.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')


def clean_text(text):
    """Return *text* reduced to ASCII letters, digits and single spaces.

    The cleaning steps run in this order:

    1. Remove anything that looks like an HTML tag (``<...>``).
    2. Remove ``http://`` / ``https://`` URLs up to the next whitespace.
    3. Remove every character that is not an ASCII letter, an ASCII digit
       or whitespace.
    4. Collapse each run of whitespace (spaces, tabs, newlines) to a single
       space and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string if nothing survives cleaning.
    """
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _NON_ALNUM_RE.sub('', text)
    # split() with no arguments splits on any whitespace run and discards
    # leading/trailing whitespace, so one join covers collapsing and trimming.
    return ' '.join(text.split())
|