Spaces:
Build error
Build error
File size: 887 Bytes
4c8fe65 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
import re
import logging
# Matches "@" followed by one or more ASCII letters, digits or underscores.
TWITTER_USER_RE = re.compile(r'@([A-Za-z0-9_]+)')
# Matches "http://" or "https://" followed by a run of non-whitespace characters.
TWITTER_URL_RE = re.compile(r'https?:\/\/\S+')

# Placeholder templates: a readable tag followed by underscore filler.
# They are sliced/padded to the width of the text they replace.
TWITTER_USER_PH = '@USER' + ('_' * 15)
TWITTER_URL_PH = 'HTTPURL' + ('_' * 93)


def _same_width_placeholder(match: "re.Match", template: str) -> str:
    """Return *template* cut or underscore-padded to the width of *match*."""
    width = len(match.group())
    # Slicing handles matches shorter than the template; ljust handles matches
    # longer than it, so the replacement never changes the text length.
    return template[:width].ljust(width, '_')


def normalize_user_url(text: str, lower: bool = False) -> str:
    """Mask user mentions and URLs in *text* with same-length placeholders.

    Every match of ``TWITTER_USER_RE`` is replaced by a prefix of
    ``TWITTER_USER_PH`` and every match of ``TWITTER_URL_RE`` by a prefix of
    ``TWITTER_URL_PH``. A match wider than its template is padded with
    underscores, so each replacement has exactly the width of the text it
    replaces. Afterwards double quotes and newlines are turned into spaces.

    Args:
        text: The string to normalize.
        lower: When true, the whole result is lowercased. This happens after
            masking, so the placeholders are lowercased as well.

    Returns:
        The normalized string, with the same length as *text*.
    """
    # Mentions are masked first; a URL that contains a mention is still
    # matched as a whole by the URL pattern afterwards.
    text = TWITTER_USER_RE.sub(
        lambda m: _same_width_placeholder(m, TWITTER_USER_PH), text
    )
    text = TWITTER_URL_RE.sub(
        lambda m: _same_width_placeholder(m, TWITTER_URL_PH), text
    )
    if lower:
        text = text.lower()
    text = text.replace("\"", " ")
    text = text.replace("\n", " ")
    return text
def normalize_tweets(data_tweets):
    """Return the normalized, lowercased text of every tweet in *data_tweets*.

    Each item must expose a ``full_text`` attribute; its value is lowercased
    and then passed through ``normalize_user_url``. The number of processed
    texts is logged at INFO level once all items have been handled.

    Args:
        data_tweets: Iterable of objects with a ``full_text`` string attribute.

    Returns:
        A list with one normalized string per input item, in input order.
    """
    normalized = [
        normalize_user_url(tweet.full_text.lower()) for tweet in data_tweets
    ]
    logging.info("Loaded %s texts", len(normalized))
    return normalized
|