import string

import nltk
import pymorphy2
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer


class DataPreprocessor:
    """Tokenize and lemmatize (Russian) text for downstream NLP use.

    Pipeline: lower-case -> WordPunct tokenization -> drop stopwords and
    punctuation -> pymorphy2 lemmatization (normal form).
    """

    def __init__(self):
        # quiet=True: avoid re-printing NLTK download progress on every
        # instantiation; the download is a no-op once the corpus is cached.
        nltk.download('stopwords', quiet=True)
        self.morph = pymorphy2.MorphAnalyzer()
        self.tokenizer = WordPunctTokenizer()
        self.punctuation = set(string.punctuation)
        self.stopwords_russian = stopwords.words("russian")
        # Deliberately keep 'и', 'или', 'не' despite NLTK listing them as
        # stopwords — they carry logical (and/or/not) meaning.
        self.stop_tokens = (set(self.stopwords_russian) - {'и', 'или', 'не'}).union(self.punctuation)

    def tokenize_data(self, texts):
        """Lower-case and tokenize every text.

        Args:
            texts: iterable of values; each is coerced with ``str``.

        Returns:
            list[list[str]]: one token list per input text.
        """
        return [self.tokenizer.tokenize(str(text).lower()) for text in texts]

    def lemmatize_tokens_string(self, tokens_string):
        """Lemmatize one token list, dropping stopwords and punctuation.

        Args:
            tokens_string: iterable of token strings.

        Returns:
            list[str]: normal forms of the tokens that survive filtering.
        """
        # parse(token)[0] takes pymorphy2's highest-scored analysis.
        return [
            self.morph.parse(token)[0].normal_form
            for token in tokens_string
            if token not in self.stop_tokens
        ]

    def lemmatize_tokens(self, tokens):
        """Lemmatize each token list of ``tokens`` in place; returns None."""
        for i, token_list in enumerate(tokens):
            tokens[i] = self.lemmatize_tokens_string(token_list)

    def preprocess_texts(self, texts):
        """Run the full pipeline: tokenize, then lemmatize.

        Args:
            texts: iterable of raw texts.

        Returns:
            list[list[str]]: filtered lemmas per input text.
        """
        tokens = self.tokenize_data(texts)
        self.lemmatize_tokens(tokens)
        return tokens