nlp-bert-team / models /model1 /lstm_preprocessor.py
VerVelVel's picture
images
961ee03
import string
import numpy as np
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords
import joblib
import re
class TextPreprocessorWord2Vec(BaseEstimator, TransformerMixin):
    """Convert a raw text string into a fixed-length tensor of vocabulary indices.

    The pipeline lower-cases the text, strips HTML tags, ASCII punctuation,
    Russian stop words and all-digit tokens, maps the remaining words through
    ``vocab_to_int`` and left-pads (or truncates) the result to a fixed length.

    Attributes:
        stop_words: Set of Russian stop words used by ``preprocess_text``.
        vocab_to_int: Word -> integer index mapping loaded from
            ``models/model1/lstm_vocab_to_int.pkl``.
    """

    # Sequence length produced by ``transform``.
    SEQ_LEN = 64

    # Compiled once at class creation instead of on every call.
    _HTML_TAG_RE = re.compile(r'<.*?>')
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    # Filled lazily by ``_default_stop_words`` so the NLTK corpus is read once.
    _stop_words_cache = None

    def __init__(self):
        # Each instance gets its own mutable copy of the cached stop words.
        self.stop_words = set(self._default_stop_words())
        # NOTE(review): the path is resolved against the current working
        # directory, and joblib.load unpickles the file -- only load
        # vocabulary files from a trusted source.
        self.vocab_to_int = joblib.load('models/model1/lstm_vocab_to_int.pkl')

    @classmethod
    def _default_stop_words(cls):
        """Return the Russian NLTK stop words, reading the corpus only once."""
        if cls._stop_words_cache is None:
            cls._stop_words_cache = frozenset(stopwords.words('russian'))
        return cls._stop_words_cache

    @staticmethod
    def _clean_text(text, stop_words):
        """Normalise ``text`` and drop tokens found in ``stop_words``.

        Shared by ``preprocess_text`` and ``preprocess_single_string`` so the
        static entry point does not need to construct a full instance.
        """
        # Lower-case.
        text = text.lower()
        # Remove HTML tags.
        text = TextPreprocessorWord2Vec._HTML_TAG_RE.sub('', text)
        # Remove ASCII punctuation. Characters are deleted, not replaced by a
        # space, so "a,b" becomes "ab"; kept as-is so the produced tokens do
        # not change.
        text = text.translate(TextPreprocessorWord2Vec._PUNCTUATION_TABLE)
        # Remove stop words and all-digit tokens in a single pass.
        kept = [
            word for word in text.split()
            if word not in stop_words and not word.isdigit()
        ]
        return ' '.join(kept)

    def preprocess_text(self, text):
        """Return ``text`` cleaned with this instance's ``stop_words``.

        Args:
            text: Raw input string.

        Returns:
            The remaining tokens joined by single spaces.
        """
        return self._clean_text(text, self.stop_words)

    @staticmethod
    def padding(review_int: list, seq_len: int) -> np.ndarray:
        """Left-pad with zeros, or truncate, every sequence to ``seq_len``.

        Args:
            review_int: List of integer sequences.
            seq_len: Target length of every row.

        Returns:
            Integer array of shape ``(len(review_int), seq_len)``. Shorter
            sequences are right-aligned with zeros in front; longer ones keep
            only their first ``seq_len`` items.
        """
        features = np.zeros((len(review_int), seq_len), dtype=int)
        for i, review in enumerate(review_int):
            kept = review[:seq_len]
            # Skip empty sequences: the row is already all zeros.
            if len(kept):
                features[i, seq_len - len(kept):] = kept
        return features

    @staticmethod
    def preprocess_single_string(
        input_string: str,
        seq_len: int,
        vocab_to_int: dict,
        verbose: bool = False
    ) -> torch.Tensor:
        """Clean one string and encode it as a padded tensor of indices.

        Args:
            input_string: Raw text to encode.
            seq_len: Length of the returned tensor.
            vocab_to_int: Word -> integer index mapping.
            verbose: When true, print every word missing from ``vocab_to_int``.

        Returns:
            1-D tensor of length ``seq_len``; words missing from the
            vocabulary are skipped.
        """
        # Use the cached default stop words rather than building a whole
        # instance, which would reload the vocabulary pickle on every call.
        preprocessed_string = TextPreprocessorWord2Vec._clean_text(
            input_string, TextPreprocessorWord2Vec._default_stop_words()
        )
        result_list = []
        for word in preprocessed_string.split():
            try:
                result_list.append(vocab_to_int[word])
            except KeyError as e:
                # Out-of-vocabulary words are dropped on purpose.
                if verbose:
                    print(f'{e}: not in dictionary!')
        # Honour the caller's seq_len instead of a hard-coded length.
        result_padded = TextPreprocessorWord2Vec.padding([result_list], seq_len)[0]
        return torch.tensor(result_padded)

    def fit(self, X, y=None):
        """No-op; nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Encode the single string ``X`` as a tensor of length ``SEQ_LEN``.

        Args:
            X: One raw text string (not a collection of strings).
            y: Ignored.

        Returns:
            1-D tensor of ``SEQ_LEN`` vocabulary indices.
        """
        return self.preprocess_single_string(X, self.SEQ_LEN, self.vocab_to_int)