FredZhang7
committed on
Commit
•
71aeaca
1
Parent(s):
767bf1b
Added missing step
Browse files
- preprocess.py +10 -7
preprocess.py
CHANGED
@@ -1,20 +1,23 @@
|
|
1 |
import re
|
2 |
|
3 |
-
def
|
4 |
-
#
|
5 |
-
|
|
|
|
|
|
|
6 |
|
7 |
# Replace " , " with an empty space
|
8 |
-
|
9 |
|
10 |
# Remove any trailing commas
|
11 |
-
|
12 |
|
13 |
# Strip spaces
|
14 |
-
|
15 |
|
16 |
# Remove any usernames
|
17 |
-
words =
|
18 |
result = []
|
19 |
for word in words:
|
20 |
word = word.strip()
|
|
|
1 |
import re
|
2 |
|
3 |
+
def clean_tags(tags):
|
4 |
+
# Make tags more human readable
|
5 |
+
tags = tags.replace(' ', ', ').replace('_', ' ')
|
6 |
+
|
7 |
+
# Remove "!", "?", ".", "(", ")" from the tags
|
8 |
+
tags = re.sub(r"[!.?()]", "", tags)
|
9 |
|
10 |
# Replace " , " with an empty space
|
11 |
+
tags = re.sub(r" , ", " ", tags)
|
12 |
|
13 |
# Remove any trailing commas
|
14 |
+
tags = re.sub(r"^,|,$", "", tags)
|
15 |
|
16 |
# Strip spaces
|
17 |
+
tags = tags.strip()
|
18 |
|
19 |
# Remove any usernames
|
20 |
+
words = tags.split(", ")
|
21 |
result = []
|
22 |
for word in words:
|
23 |
word = word.strip()
|