Iulian277's picture
Create normalize.py
251b871
raw
history blame contribute delete
No virus
1.65 kB
import re
import emoji
def normalize(sentence: str) -> str:
    """
    Normalize a raw text string before tokenization.

    Steps, in order:
      1. Maps cedilla-based Romanian characters (ş/Ş, ţ/Ţ) to their
         correct comma-below forms (ș/Ș, ț/Ț).
      2. Replaces @mentions with USER, #hashtags with HASHTAG, and
         http.../www... URLs with HTTPURL.
      3. Converts emoji to textual codes via emoji.demojize, then strips
         '_', ':' and '-' from each code, so e.g. :red_heart:
         becomes redheart.
      4. Collapses whitespace runs into single spaces and strips
         leading/trailing whitespace.

    Args:
        sentence (str): The input text (coerced to str if not already).

    Returns:
        str: The normalized string.
    """
    # Defensively coerce non-string input (e.g. NaN coming from pandas) to str.
    sentence = str(sentence)
    # Fix cedilla vs. comma-below diacritics in one C-level pass
    # instead of four separate regex substitutions.
    sentence = sentence.translate(str.maketrans('şŞţŢ', 'șȘțȚ'))
    # Mask user handles, hashtags and URLs with placeholder tokens.
    sentence = re.sub(r'@\S+', 'USER', sentence)
    sentence = re.sub(r'#\S+', 'HASHTAG', sentence)
    # A single pattern covers both http(s)://... and bare www... URLs.
    sentence = re.sub(r'(?:http|www)\S+', 'HTTPURL', sentence)
    # Convert emoji to ' :long_name: ' tokens; the delimiters add spaces so
    # adjacent emoji do not fuse with neighbouring words.
    sentence = emoji.demojize(sentence, delimiters=(' :', ': '))
    # Squash each ':long_name:' token down to 'longname'.
    sentence = re.sub(
        r':\S+:',
        lambda m: m.group(0).replace('_', '').replace(':', '').replace('-', ''),
        sentence,
    )
    # Collapse whitespace runs and trim the boundary spaces that the
    # demojize delimiters may have introduced (the original left a stray
    # leading/trailing space when the text started or ended with an emoji).
    return re.sub(r'\s+', ' ', sentence).strip()