"""Text preprocessing utilities: cleaning, tokenization and lemmatization (pymorphy2)."""
import re
import string
from functools import lru_cache

import nltk
import pymorphy2
from nltk.tokenize import word_tokenize

nltk.download("punkt")
def clean_text(text: str) -> str:
    """Normalize raw text for downstream tokenization.

    Steps, in order:
      1. lowercase;
      2. drop whole words containing a character repeated 3+ times in a row
         (e.g. elongated words like "soooo");
      3. drop digit-led tokens (e.g. "123", "4th");
      4. drop bracketed spans like "[note]" (non-greedy, single line);
      5. strip all ASCII punctuation.

    Whitespace left behind by removals is NOT collapsed — callers that
    tokenize afterwards are unaffected by the extra spaces.

    Args:
        text: Arbitrary input string (may be empty).

    Returns:
        The cleaned, lowercased string.
    """
    text = text.lower()
    # \1{2,} = the captured char plus 2+ repeats -> 3+ identical chars in a row.
    text = re.sub(r"\w*(\w)\1{2,}\w*", "", text)
    # Digits optionally followed by word chars ("123", "42nd").
    text = re.sub(r"\d+\w*", "", text)
    # Bracketed annotations, non-greedy so "[a] x [b]" keeps " x ".
    text = re.sub(r"\[.*?\]", "", text)
    # Single C-level pass over string.punctuation (faster than chained .replace).
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text
@lru_cache(maxsize=1)
def _get_morph_analyzer() -> "pymorphy2.MorphAnalyzer":
    """Build the morphological analyzer once and reuse it.

    pymorphy2.MorphAnalyzer() loads its dictionaries on construction, which
    is expensive; the original code rebuilt it on every call.
    """
    return pymorphy2.MorphAnalyzer()


def lemmize_and_tokenize_text(text: str) -> list[str]:
    """Tokenize *text* and reduce each token to its normal (lemma) form.

    Args:
        text: Input string, ideally pre-cleaned by ``clean_text``.

    Returns:
        A list of lemmas, one per token, in original token order
        (empty list for empty input).
    """
    morph = _get_morph_analyzer()
    tokens = word_tokenize(text)
    # parse() returns candidates sorted by likelihood; take the best one.
    return [morph.parse(token)[0].normal_form for token in tokens]
def data_preprocessing(text: str) -> list[str]:
    """Run the full preprocessing pipeline on *text*.

    Cleans the string (lowercase, strip noise/punctuation) and then
    tokenizes and lemmatizes it.

    Args:
        text: Raw input string.

    Returns:
        A list of lemmatized tokens.
    """
    return lemmize_and_tokenize_text(clean_text(text))