Spaces:
Build error
Build error
import re | |
import logging | |
# Matches a mention: "@" followed by one or more ASCII letters, digits or
# underscores (the handle itself is captured in group 1).
TWITTER_USER_RE = re.compile(r'@([A-Za-z0-9_]+)')

# Matches an http:// or https:// URL up to the next whitespace character.
TWITTER_URL_RE = re.compile(r'https?:\/\/\S+')

# Placeholder templates, right-padded with underscores so they can be sliced
# down to the length of the text they stand in for.
TWITTER_USER_PH = '@USER'.ljust(20, '_')    # 5 visible chars + 15 underscores
TWITTER_URL_PH = 'HTTPURL'.ljust(100, '_')  # 7 visible chars + 93 underscores
def _fit_placeholder(placeholder: str, width: int) -> str:
    """Return *placeholder* truncated or underscore-padded to exactly *width* chars."""
    # ljust() is a no-op when the placeholder is already long enough, so short
    # matches still get a plain prefix of the placeholder; longer matches are
    # padded instead of being silently shortened.
    return placeholder.ljust(width, '_')[:width]


def normalize_user_url(text: str, lower: bool = False) -> str:
    """Mask user mentions and URLs in *text* with same-length placeholders.

    Every match of ``TWITTER_USER_RE`` is replaced by ``TWITTER_USER_PH`` and
    every match of ``TWITTER_URL_RE`` by ``TWITTER_URL_PH``, each cut or
    underscore-padded to the exact length of the matched text, so the
    substitution never shifts the position of the surrounding characters.
    Double quotes and newlines are then replaced by single spaces.

    Args:
        text: The raw text to normalize.
        lower: When true, lowercase the text *after* the placeholders have
            been inserted (so the placeholders are lowercased as well).

    Returns:
        The normalized text.
    """
    text = TWITTER_USER_RE.sub(
        lambda m: _fit_placeholder(TWITTER_USER_PH, len(m.group())), text
    )
    text = TWITTER_URL_RE.sub(
        lambda m: _fit_placeholder(TWITTER_URL_PH, len(m.group())), text
    )
    if lower:
        text = text.lower()
    # Quotes and line breaks become spaces (one character for one character).
    return text.replace("\"", " ").replace("\n", " ")
def normalize_tweets(data_tweets):
    """Return the lowercased, normalized text of every tweet in *data_tweets*.

    For each item, its ``full_text`` attribute is lowercased and then passed
    through ``normalize_user_url``. The number of processed texts is logged
    at INFO level.

    Args:
        data_tweets: Iterable of objects exposing a ``full_text`` attribute
            (presumably a ``str`` -- confirm against the caller).

    Returns:
        A list with one normalized string per input item, in input order.
    """
    normalized = [
        normalize_user_url(tweet.full_text.lower()) for tweet in data_tweets
    ]
    logging.info("Loaded %s texts", len(normalized))
    return normalized