File size: 887 Bytes
4c8fe65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import re
import logging

# Matches an "@" followed by one or more ASCII letters, digits or underscores.
TWITTER_USER_RE = re.compile(r'@([A-Za-z0-9_]+)')
# Matches "http://" or "https://" followed by a run of non-whitespace.
TWITTER_URL_RE = re.compile(r'https?:\/\/\S+')
# Placeholder templates (20 and 100 characters long). They are cut or padded
# to the length of the text they replace, so character offsets are preserved.
TWITTER_USER_PH = '@USER' + ('_' * 15)
TWITTER_URL_PH = 'HTTPURL' + ('_' * 93)


def _fit_placeholder(placeholder: str, length: int) -> str:
    """Return *placeholder* cut or '_'-padded to exactly *length* characters."""
    return placeholder[:length].ljust(length, '_')


def normalize_user_url(text: str, lower: bool = False) -> str:
    """Mask user mentions and URLs in *text* with same-length placeholders.

    Every ``@name`` match is replaced by ``@USER`` and every ``http(s)://``
    match by ``HTTPURL``; each placeholder is truncated or padded with
    underscores so that it is exactly as long as the text it replaces
    (e.g. ``@bob`` becomes ``@USE``). Afterwards double quotes and newlines
    are each replaced by a single space, so the result always has the same
    length as the input.

    Args:
        text: The raw text to normalize.
        lower: When true, lowercase the whole result. This happens after the
            substitution, so the inserted placeholders are lowercased too.

    Returns:
        The normalized text.
    """
    # Mentions are masked first; a mention embedded in a URL is then simply
    # covered by the URL placeholder.
    text = TWITTER_USER_RE.sub(
        lambda m: _fit_placeholder(TWITTER_USER_PH, len(m.group())), text)
    text = TWITTER_URL_RE.sub(
        lambda m: _fit_placeholder(TWITTER_URL_PH, len(m.group())), text)
    if lower:
        text = text.lower()
    return text.replace("\"", " ").replace("\n", " ")


def normalize_tweets(data_tweets):
    """Return the lowercased, normalized text of every tweet in *data_tweets*.

    Each item must expose a ``full_text`` string attribute. The text is
    lowercased and then passed through ``normalize_user_url`` with its
    default arguments. The number of processed texts is logged at INFO level.

    Args:
        data_tweets: Iterable of tweet objects with a ``full_text`` attribute.

    Returns:
        A list of normalized strings, in input order.
    """
    normalized = [
        normalize_user_url(tweet.full_text.lower())
        for tweet in data_tweets
    ]
    logging.info("Loaded %s texts", len(normalized))
    return normalized