Spaces:
Build error
Build error
import re | |
class TextPreprocess(): | |
def __init__(self, teencode_dir="./storage/teencode.txt"): | |
self.get_teencode(teencode_dir) | |
def get_teencode(self, teencode_dir): | |
with open(teencode_dir, "r", encoding="utf-8") as f: | |
teencode_original = f.readlines() | |
teencode_json = {} | |
for teencode in teencode_original: | |
key, value = teencode.split("\t") | |
value = value.replace("\n", "") | |
teencode_json[key] = value | |
self.teencode_json = teencode_json | |
def teencode_normalize(self, text): | |
text_split = text.split() | |
return " ".join([self.teencode_json.get(txt, txt) for txt in text_split]) | |
def clean_text(self, text): | |
# Xóa hashtag (dấu #) | |
text = re.sub(r'#\w+', '', text) | |
# Xóa liên kết (URL) | |
text = re.sub(r'http\S+', '', text) | |
# Xóa các ký tự số | |
text = re.sub(r'\d+', '', text) | |
# Xóa ký tự đặc biệt | |
text = re.sub(r'[^\w\s]', '', text) | |
text = " ".join(text.split()) | |
text = text.lower() | |
return text | |
def preprocess(self, text): | |
cleaned_text = self.clean_text(text) | |
cleaned_text = self.teencode_normalize(cleaned_text) | |
return cleaned_text |