Some modifications to URL removal in preprocessing
Browse files- src/data_utils.py +4 -9
src/data_utils.py
CHANGED
@@ -46,19 +46,14 @@ def clean_url(text):
|
|
46 |
result = result.replace(' ', '')
|
47 |
result = result.split(':')
|
48 |
for phrase in result:
|
49 |
-
p = phrase
|
50 |
-
|
51 |
-
|
52 |
-
if ('https :' + p) or ('https:' + p) in text:
|
53 |
text = text.replace('https :' + p, '')
|
54 |
-
|
55 |
-
elif ('http :' + p) or ('http:' + p) in text:
|
56 |
text = text.replace('http :' + p, '')
|
57 |
-
text = text.replace('http:' + p, '')
|
58 |
elif '@' in p:
|
59 |
if p in text:
|
60 |
text = text.replace(p, '')
|
61 |
-
else:
|
62 |
-
text = text.replace(p, "")
|
63 |
|
64 |
return text
|
|
|
46 |
result = result.replace(' ', '')
|
47 |
result = result.split(':')
|
48 |
for phrase in result:
|
49 |
+
p = phrase
|
50 |
+
if '/ /' in p:
|
51 |
+
if ('https :' + p) in text:
|
|
|
52 |
text = text.replace('https :' + p, '')
|
53 |
+
elif ('http :' + p) in text:
|
|
|
54 |
text = text.replace('http :' + p, '')
|
|
|
55 |
elif '@' in p:
|
56 |
if p in text:
|
57 |
text = text.replace(p, '')
|
|
|
|
|
58 |
|
59 |
return text
|