import re
import string
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from gensim.models import Word2Vec
from nltk.corpus import stopwords

stop_words = set(stopwords.words('russian'))

HIDDEN_SIZE = 32
SEQ_LEN = 32

df = pd.read_json('Data/healthcare_facilities_reviews.jsonl', lines=True)


def data_preprocessing(text: str) -> str:
    text = text.lower()
    text = re.sub('<.*?>', '', text)  # strip HTML tags
    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words])  # remove stop words
    text = ' '.join([word for word in text.split() if not word.isdigit()])  # remove pure digits
    return text


contents = df['content'].tolist()
preprocessed = [data_preprocessing(content) for content in contents]

corpus = [word for text in preprocessed for word in text.split()]
sorted_words = Counter(corpus).most_common()


def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
    return list(filter(lambda x: x[1] > n, sorted_words))


# Keep only words that occur more than 100 times
sorted_words = get_words_by_freq(sorted_words, 100)
sorted_words[-10:]

# Index 0 is reserved for padding, so word indices start at 1
vocab_to_int = {word: i + 1 for i, (word, count) in enumerate(sorted_words)}

reviews_int = []
for text in preprocessed:
    r = [vocab_to_int[word] for word in text.split() if word in vocab_to_int]
    reviews_int.append(r)

# Word2Vec expects lists of tokens, filtered down to the vocabulary
w2v_input = []
for review in preprocessed:
    cur_review = [word for word in review.split() if word in vocab_to_int]
    w2v_input.append(cur_review)

VOCAB_SIZE = len(vocab_to_int) + 1
EMBEDDING_DIM = 64

wv = Word2Vec(
    min_count=1,
    vector_size=EMBEDDING_DIM
)
wv.build_vocab(w2v_input)
wv.train(
    corpus_iterable=w2v_input,
    total_examples=wv.corpus_count,
    epochs=10
)

embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

# Walk over the vocabulary: if a word has a Word2Vec vector, copy it into the matrix;
# otherwise print the word and leave its row as zeros
for word, i in vocab_to_int.items():
    try:
        embedding_matrix[i] = wv.wv[word]
    except KeyError as e:
        print(f'{e}: word: {word}')

# Pre-trained embedding layer -- it is frozen and will not be trained with the network
embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))


def padding(review_int: list, seq_len: int) -> np.ndarray:
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        if len(review) <= seq_len:
            # Left-pad short reviews with zeros
            new = list(np.zeros(seq_len - len(review))) + review
        else:
            # Truncate long reviews to seq_len
            new = review[:seq_len]
        features[i, :] = np.array(new)
    return features


def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose: bool = False
) -> torch.Tensor:
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            if verbose:
                print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]
    return torch.tensor(result_padded)
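# Illustrative sanity check of the preprocessing helpers (not part of the original
# pipeline): the sample review below is made up, and the exact indices depend on the
# vocab_to_int built above.
sample_review = 'Очень хороший врач, внимательный и вежливый персонал!'
sample_tensor = preprocess_single_string(sample_review, SEQ_LEN, vocab_to_int, verbose=True)
print(sample_tensor.shape)  # torch.Size([32]) -- left-padded with zeros up to SEQ_LEN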
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size: int = HIDDEN_SIZE) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.W = nn.Linear(hidden_size, hidden_size)
        self.U = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)
        self.tanh = nn.Tanh()

    def forward(
        self,
        keys: torch.Tensor,   # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
        query: torch.Tensor   # BATCH_SIZE x HIDDEN_SIZE
    ):
        query = query.unsqueeze(1)                    # BATCH_SIZE x 1 x HIDDEN_SIZE

        r_query = self.W(query)                       # BATCH_SIZE x 1 x HIDDEN_SIZE
        r_keys = self.U(keys)                         # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE

        scores = self.V(self.tanh(r_query + r_keys))  # BATCH_SIZE x SEQ_LEN x 1
        scores = scores.squeeze(-1)                   # BATCH_SIZE x SEQ_LEN

        att_weights = F.softmax(scores, dim=1)        # BATCH_SIZE x SEQ_LEN

        # Weighted sum of the keys (LSTM outputs) with the attention weights
        context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze(1)  # BATCH_SIZE x HIDDEN_SIZE

        return context, att_weights


class LSTMBahdanauAttention(nn.Module):
    def __init__(self) -> None:
        super().__init__()

        # self.embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
        self.embedding = embedding_layer  # frozen, pre-trained Word2Vec embeddings
        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
        self.attn = BahdanauAttention(HIDDEN_SIZE)
        self.clf = nn.Sequential(
            nn.Linear(HIDDEN_SIZE, 128),
            nn.Dropout(),
            nn.Tanh(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        embeddings = self.embedding(x)
        outputs, (h_n, _) = self.lstm(embeddings)
        # Attend over all LSTM outputs, using the last hidden state as the query
        context, att_weights = self.attn(outputs, h_n.squeeze(0))
        out = self.clf(context)
        return out, att_weights
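# Minimal usage sketch (illustrative, not from the original code): build the model and
# run a single review through it. The network is untrained at this point, so the output
# is only a smoke test; the classifier head returns a raw logit, hence the sigmoid.
model = LSTMBahdanauAttention()
model.eval()

single_input = preprocess_single_string(
    'Очень хороший врач, внимательный и вежливый персонал!',
    SEQ_LEN,
    vocab_to_int
).unsqueeze(0)  # add a batch dimension: 1 x SEQ_LEN

with torch.no_grad():
    logit, att_weights = model(single_input)

print(f'positive probability: {torch.sigmoid(logit).item():.3f}')
print(f'attention weights shape: {att_weights.shape}')  # 1 x SEQ_LEN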