import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
import pymorphy2


class DataPreprocessor:
    """Tokenizes, filters, and lemmatizes Russian text with pymorphy2."""

    def __init__(self):
        # Make sure the NLTK stopword lists are available locally.
        nltk.download('stopwords')
        self.morph = pymorphy2.MorphAnalyzer()
        self.tokenizer = WordPunctTokenizer()
        self.punctuation = set(string.punctuation)
        self.stopwords_russian = stopwords.words("russian")
        # Keep the conjunctions 'и', 'или' and the negation 'не', which carry meaning;
        # drop the remaining stopwords together with all punctuation tokens.
        self.stop_tokens = (set(self.stopwords_russian) - {'и', 'или', 'не'}).union(self.punctuation)

    def tokenize_data(self, texts):
        """Lowercase each text and split it into word/punctuation tokens."""
        return [self.tokenizer.tokenize(str(text).lower()) for text in texts]

    def lemmatize_tokens_string(self, tokens_string):
        """Drop stop tokens and reduce the remaining tokens to their normal forms."""
        new_tokens = []
        for token in tokens_string:
            if token not in self.stop_tokens:
                new_tokens.append(self.morph.parse(token)[0].normal_form)
        return new_tokens

    def lemmatize_tokens(self, tokens):
        """Lemmatize every token list in place."""
        for i in range(len(tokens)):
            tokens[i] = self.lemmatize_tokens_string(tokens[i])

    def preprocess_texts(self, texts):
        """Full pipeline: tokenize the texts, then filter and lemmatize the tokens."""
        tokens = self.tokenize_data(texts)
        self.lemmatize_tokens(tokens)
        return tokens
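

# A minimal usage sketch: the sample sentences below are hypothetical inputs, and the
# exact lemmas depend on pymorphy2's top-ranked parse for each token.
if __name__ == "__main__":
    preprocessor = DataPreprocessor()
    docs = ["Коты и собаки гуляют во дворе!", "Это не ошибка."]
    # Expected shape: one list of lemmas per document, with punctuation and most
    # stopwords removed, e.g. [['кот', 'и', 'собака', 'гулять', 'двор'], ['не', 'ошибка']]
    print(preprocessor.preprocess_texts(docs))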