"""Text cleaning and lemmatization helpers built on NLTK and pymorphy2."""

import functools
import re
import string

import nltk
import pymorphy2
from nltk.tokenize import word_tokenize

# Fetch the "punkt" NLTK resource at import time (kept as in the original
# module so importing it has the same side effect).
nltk.download("punkt")

# Any word that contains the same word character three or more times in a row.
_REPEATED_CHAR_WORD_RE = re.compile(r"\w*(\w)\1{2,}\w*")
# A run of digits together with any word characters that directly follow it.
_DIGIT_RUN_RE = re.compile(r"\d+\w*")
# A square-bracketed span on a single line (shortest match).
_BRACKETED_RE = re.compile(r"\[.*?\]")
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


@functools.lru_cache(maxsize=1)
def _get_morph_analyzer() -> pymorphy2.MorphAnalyzer:
    """Return a shared MorphAnalyzer, creating it on first use.

    Constructing a MorphAnalyzer is slow, so one instance is built lazily
    and reused instead of being rebuilt on every lemmatization call.
    """
    return pymorphy2.MorphAnalyzer()


def clean_text(text: str) -> str:
    """Lowercase *text* and strip noisy fragments from it.

    The steps are applied in this order:

    1. lowercase the text;
    2. drop words containing a character repeated three or more times in a row;
    3. drop digit runs together with the word characters that follow them;
    4. drop square-bracketed spans;
    5. delete every character listed in ``string.punctuation``.

    Removed fragments are replaced with nothing, so the surrounding
    whitespace is left as is.

    Args:
        text: Raw input text.

    Returns:
        The cleaned, lowercased text.
    """
    text = text.lower()
    text = _REPEATED_CHAR_WORD_RE.sub("", text)
    text = _DIGIT_RUN_RE.sub("", text)
    text = _BRACKETED_RE.sub("", text)
    return text.translate(_PUNCTUATION_TABLE)


def lemmize_and_tokenize_text(text: str) -> list[str]:
    """Tokenize *text* and return the normal form of every token.

    Tokens come from ``nltk.tokenize.word_tokenize``; for each token the
    first parse returned by pymorphy2 supplies the normal form.

    Args:
        text: Text to tokenize and lemmatize.

    Returns:
        One lemma per token, in token order.
    """
    morph = _get_morph_analyzer()
    return [morph.parse(token)[0].normal_form for token in word_tokenize(text)]


def data_preprocessing(text: str) -> list[str]:
    """Clean *text* with ``clean_text`` and lemmatize the result.

    Args:
        text: Raw input text.

    Returns:
        The lemmas of the cleaned text, as produced by
        ``lemmize_and_tokenize_text``.
    """
    return lemmize_and_tokenize_text(clean_text(text))