import re
import string

import pymorphy2
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words("russian"))

# Build the analyzer once at import time: pymorphy2.MorphAnalyzer() is
# expensive to construct and should not be recreated on every call.
morph = pymorphy2.MorphAnalyzer()


def clean_text(text: str) -> str:
    text = text.lower()
    # Drop words containing a character repeated three or more times
    # (e.g. elongated interjections like "ооочень").
    text = re.sub(r"\w*(\w)\1{2,}\w*", "", text)
    # Drop numbers together with any word characters glued to them.
    text = re.sub(r"\d+\w*", "", text)
    # Drop bracketed fragments such as markup or mention tags.
    text = re.sub(r"\[.*?\]", "", text)
    # Strip ASCII punctuation.
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text


def lemmize_and_tokenize_text(text: str) -> list[str]:
    tokens = word_tokenize(text, language="russian")
    tokens = [token for token in tokens if token not in stop_words]
    # Keep the normal form of the most probable parse for each token.
    lemmas = [morph.parse(token)[0].normal_form for token in tokens]
    return lemmas


def data_preprocessing(text: str) -> list[str]:
    cleaned_text = clean_text(text)
    lemmized_text = lemmize_and_tokenize_text(cleaned_text)
    return lemmized_text
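

# --- Usage sketch. Assumptions: the NLTK "punkt" and "stopwords" data are
# already downloaded (e.g. nltk.download("punkt"); nltk.download("stopwords")),
# and the sample sentence is hypothetical. ---
if __name__ == "__main__":
    sample = "Это прееекрасный день, 25 градусов [реклама] тепла!!!"
    print(data_preprocessing(sample))
    # Roughly ['день', 'градус', 'тепло']: the elongated word, the number,
    # the bracketed fragment, the punctuation, and the stop word "это" are
    # all removed; exact lemmas depend on pymorphy2's first parse.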