thamani committed on
Commit
749c265
·
verified ·
1 Parent(s): 2567f25

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +16 -0
utils.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def clean_text(text):
    """Reduce raw text to single-space-separated ASCII alphanumeric words.

    The cleaning steps run in this order:

    1. HTML tags (anything between ``<`` and the next ``>``) are deleted.
    2. ``http://`` / ``https://`` URLs are deleted, including any
       ``#fragment`` or ``~user`` part.
    3. Every character that is not an ASCII letter, an ASCII digit, or
       whitespace is deleted.
    4. Runs of whitespace (spaces, tabs, newlines, ...) are collapsed to a
       single space and leading/trailing whitespace is trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if nothing survives the cleaning.
    """
    # Remove HTML tags
    text = re.sub(r'<[^>]*?>', '', text)
    # Remove URLs. '#' and '~' are part of the accepted character set so that
    # fragments and tilde paths are removed together with the rest of the URL
    # instead of leaking into the output as stray words.
    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#~]|[!*\\(\\),]'
        r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '',
        text,
    )
    # Remove special characters but keep ALL whitespace for now: deleting
    # tabs/newlines here would glue together words that were only separated
    # by a line break or a tab.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # split() with no arguments trims both ends and splits on any whitespace
    # run, so a single join normalizes every separator to one space.
    return ' '.join(text.split())