File size: 1,504 Bytes
087390d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer
import joblib
import nltk
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
import string

# Fetch the NLTK data packages at import time, in the same order as before.
# 'stopwords' backs the stopwords.words('russian') lookup used in this module;
# NOTE(review): no use of 'punkt' is visible here - presumably a tokenizer
# elsewhere needs it; confirm before removing.
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)
del _resource  # keep the module namespace free of the loop variable

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that cleans and lemmatizes text documents.

    Each document goes through the following steps, in order:

    1. every character that is not a Latin/Cyrillic letter, a digit, one of
       ``. , ! ? ; :`` or whitespace is deleted;
    2. every run of non-whitespace characters starting with ``http`` or
       ``www`` is deleted;
    3. all ASCII punctuation (``string.punctuation``) is deleted;
    4. the text is lower-cased and split on whitespace;
    5. tokens found in the Russian NLTK stop-word list are dropped;
    6. each remaining token is replaced by the normal form of the first
       parse returned by the morphological analyzer.

    Attributes:
        stop_words: set of Russian stop words loaded from NLTK.
        morph: ``MorphAnalyzer`` instance used for lemmatization.
    """

    # Built once for the class rather than on every preprocess_text() call.
    _CLEAN_PATTERN = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]')
    _URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        self.stop_words = set(stopwords.words('russian'))
        self.morph = MorphAnalyzer()

    def preprocess_text(self, text):
        """Return the cleaned, lemmatized, space-joined form of ``text``.

        Args:
            text: the document to normalise. Missing scalars (``None``, NaN,
                ``pd.NA``, ...) yield an empty string; any other non-string
                value is converted with ``str()`` first.

        Returns:
            A single string of lemmas separated by one space (possibly empty).
        """
        if not isinstance(text, str):
            # A missing cell would otherwise make re.sub raise TypeError.
            if pd.api.types.is_scalar(text) and pd.isna(text):
                return ''
            text = str(text)

        # Delete everything that is not a letter, a digit, basic punctuation
        # or whitespace. Whitespace is kept, so a URL is still one contiguous
        # token when the URL pattern runs next.
        text = self._CLEAN_PATTERN.sub('', text)
        text = self._URL_PATTERN.sub('', text)

        # NOTE: punctuation is deleted, not replaced by a space, so "a,b"
        # becomes "ab". Kept as-is so the output for string input is
        # unchanged for anything fitted on it.
        text = text.translate(self._PUNCT_TABLE).lower()

        # Stop words are filtered on the surface form, before lemmatization.
        lemmas = [
            self.morph.parse(token)[0].normal_form
            for token in text.split()
            if token not in self.stop_words
        ]
        return ' '.join(lemmas)

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def transform(self, X, y=None):
        """Apply :meth:`preprocess_text` to every document in ``X``.

        Args:
            X: a ``pandas.Series`` of documents, or any 1-D sequence that
                ``pandas.Series`` can be built from (list, tuple, ndarray).
            y: ignored; accepted for scikit-learn API compatibility.

        Returns:
            A ``pandas.Series`` of processed strings. When ``X`` is already a
            Series its index is preserved.
        """
        if not isinstance(X, pd.Series):
            # Pipelines often pass lists/arrays, which have no .apply().
            X = pd.Series(X, dtype=object)
        return X.apply(self.preprocess_text)