|
import re |
|
import emoji |
|
|
|
def normalize(sentence: str) -> str:
    """Normalize a string before tokenization.

    Normalization steps, applied in order:
      1. Replace Romanian cedilla letters (ş/Ş, ţ/Ţ) with the correct
         comma-below letters (ș/Ș, ț/Ț).
      2. Replace ``@mention`` with ``USER``, ``#hashtag`` with ``HASHTAG``,
         and ``http...`` / ``www...`` URLs with ``HTTPURL``.
      3. Convert emoji to ``:emoji_with_long_name:`` tokens.
      4. Strip ``_``, ``:`` and ``-`` from those tokens, leaving a bare
         word (e.g. ``grinningface``).
      5. Collapse runs of whitespace into a single space and strip
         leading/trailing whitespace.

    Args:
        sentence (str): The input string.

    Returns:
        str: The normalized string.
    """
    # Defensive cast: callers may pass non-str values (e.g. NaN from a
    # dataframe column) -- preserved from the original behavior.
    sentence = str(sentence)

    # 1. One-pass character translation; equivalent to the four chained
    #    single-character re.sub calls, but a single C-level pass.
    sentence = sentence.translate(str.maketrans('şŞţŢ', 'șȘțȚ'))

    # 2. Mask user-identifying / non-lexical tokens. Order matters:
    #    mentions and hashtags are replaced before URL patterns.
    sentence = re.sub(r'@\S+', 'USER', sentence)
    sentence = re.sub(r'#\S+', 'HASHTAG', sentence)
    sentence = re.sub(r'http\S+', 'HTTPURL', sentence)
    sentence = re.sub(r'www\S+', 'HTTPURL', sentence)

    # 3. Demojize into ' :name: ' tokens; the spaced delimiters keep emoji
    #    separated from adjacent words so step 4 can match them.
    sentence = emoji.demojize(sentence, delimiters=(' :', ': '))

    # 4. Strip '_', ':' and '-' inside each ':name:' token.
    #    NOTE(review): ':\S+:' can also hit colon-delimited text such as
    #    times ("12:30:45") -- preserved from the original behavior.
    sentence = re.sub(
        r':\S+:',
        lambda m: m.group(0).replace('_', '').replace(':', '').replace('-', ''),
        sentence,
    )

    # 5. Collapse whitespace. strip() fixes the leading/trailing space the
    #    demojize delimiters leave behind (the original returned it).
    return re.sub(r'\s+', ' ', sentence).strip()