truong-xuan-linh's picture
init
4589b64
raw
history blame contribute delete
No virus
1.17 kB
import re
class TextPreprocess():
def __init__(self, teencode_dir="./storage/teencode.txt"):
self.get_teencode(teencode_dir)
def get_teencode(self, teencode_dir):
with open(teencode_dir, "r", encoding="utf-8") as f:
teencode_original = f.readlines()
teencode_json = {}
for teencode in teencode_original:
key, value = teencode.split("\t")
value = value.replace("\n", "")
teencode_json[key] = value
self.teencode_json = teencode_json
def teencode_normalize(self, text):
text_split = text.split()
return " ".join([self.teencode_json.get(txt, txt) for txt in text_split])
def clean_text(self, text):
# Xóa hashtag (dấu #)
text = re.sub(r'#\w+', '', text)
# Xóa liên kết (URL)
text = re.sub(r'http\S+', '', text)
# Xóa các ký tự số
text = re.sub(r'\d+', '', text)
# Xóa ký tự đặc biệt
text = re.sub(r'[^\w\s]', '', text)
text = " ".join(text.split())
text = text.lower()
return text
def preprocess(self, text):
cleaned_text = self.clean_text(text)
cleaned_text = self.teencode_normalize(cleaned_text)
return cleaned_text