"""Russian text preprocessing: an sklearn-compatible transformer that cleans,
tokenizes, removes stopwords, and lemmatizes text with pymorphy2."""
import re
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer
import joblib
import nltk
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
import string
# Fetch the NLTK resources required at runtime (Russian stopword list and
# the 'punkt' tokenizer data).
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer that cleans and lemmatizes Russian text.

    For each document: URLs are stripped first, then every character that is
    not a Latin/Cyrillic letter, digit, allowed punctuation mark, or
    whitespace; remaining ASCII punctuation is removed, the text is
    lowercased, Russian stopwords are dropped, and each surviving token is
    reduced to its normal form with pymorphy2.
    """

    # Compiled once at class creation; previously these were rebuilt on
    # every preprocess_text call.
    _URL_RE = re.compile(r'http\S+|www\S+|https\S+')
    _CLEAN_RE = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        # Stopword set and morphological analyzer are shared across all
        # documents processed by this instance.
        self.stop_words = set(stopwords.words('russian'))
        self.morph = MorphAnalyzer()
        # Per-instance lemma memo: MorphAnalyzer.parse is expensive and
        # corpora repeat words heavily. (A dict, not lru_cache on a method,
        # so the cache does not pin instances alive.)
        self._lemma_cache = {}

    def _lemmatize(self, word):
        """Return the normal form of *word*, memoized per instance."""
        lemma = self._lemma_cache.get(word)
        if lemma is None:
            lemma = self.morph.parse(word)[0].normal_form
            self._lemma_cache[word] = lemma
        return lemma

    def preprocess_text(self, text):
        """Normalize one document; return the space-joined lemmatized tokens.

        Bug fix: URLs are removed *before* the character filter. The old
        order stripped '/' and ':' first, so the URL regex could no longer
        match and bare URL fragments leaked into the token stream.
        """
        text = self._URL_RE.sub('', text)
        # Keep only Latin/Cyrillic letters, digits, basic punctuation, spaces.
        text = self._CLEAN_RE.sub('', text)
        # Drop the remaining ASCII punctuation in a single C-level pass.
        text = text.translate(self._PUNCT_TABLE)
        text = text.lower()
        tokens = text.split()
        return ' '.join(
            self._lemmatize(word)
            for word in tokens
            if word not in self.stop_words
        )

    def fit(self, X, y=None):
        """Stateless transformer: nothing to fit."""
        return self

    def transform(self, X, y=None):
        """Apply preprocess_text element-wise (X is a pandas Series)."""
        return X.apply(self.preprocess_text)