import re
import string
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from gensim.models import Word2Vec
from nltk.corpus import stopwords

stop_words = set(stopwords.words('russian'))

HIDDEN_SIZE = 32  # LSTM hidden state size
SEQ_LEN = 32      # fixed review length in tokens after padding/truncation

df = pd.read_json('healthcare_facilities_reviews.jsonl', lines=True)

def data_preprocessing(text: str) -> str:
    """Lowercase the text, strip HTML tags, punctuation, Russian stop words and pure digits."""
    text = text.lower()
    text = re.sub('<.*?>', '', text)
    text = ''.join([c for c in text if c not in string.punctuation])
    text = ' '.join([word for word in text.split() if word not in stop_words])
    text = ' '.join([word for word in text.split() if not word.isdigit()])
    return text

contents = df['content'].tolist()
preprocessed = [data_preprocessing(content) for content in contents]

# Flatten all cleaned reviews into one token list and count word frequencies
corpus = [word for text in preprocessed for word in text.split()]
sorted_words = Counter(corpus).most_common()

def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
    """Keep only the words that occur more than n times in the corpus."""
    return list(filter(lambda x: x[1] > n, sorted_words))


sorted_words = get_words_by_freq(sorted_words, 100)
sorted_words[-10:]  # inspect the rarest words that survived the frequency cut

# Map every remaining word to an integer index; index 0 is reserved for padding
vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}

# Encode each review as a list of vocabulary indices, dropping out-of-vocabulary words
reviews_int = []
for text in preprocessed:
    r = [vocab_to_int[word] for word in text.split() if vocab_to_int.get(word)]
    reviews_int.append(r)

# Word2Vec expects tokenised sentences; keep only in-vocabulary words
w2v_input = []
for review in preprocessed:
    cur_review = [word for word in review.split() if vocab_to_int.get(word)]
    w2v_input.append(cur_review)

VOCAB_SIZE = len(vocab_to_int) + 1  # +1 for the padding index 0
EMBEDDING_DIM = 64

wv = Word2Vec(
    min_count=1,
    vector_size=EMBEDDING_DIM
)
wv.build_vocab(w2v_input)

wv.train(
    corpus_iterable=w2v_input,
    total_examples=wv.corpus_count,
    epochs=10
)

# Pre-trained embedding matrix: row i holds the Word2Vec vector of the word with index i
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

for word, i in vocab_to_int.items():
    try:
        embedding_vector = wv.wv[word]
        embedding_matrix[i] = embedding_vector
    except KeyError as e:
        # With min_count=1 every vocabulary word should be in the Word2Vec model,
        # but report any missing ones instead of silently skipping them
        print(f'{e}: word: {word}')

# from_pretrained freezes the embedding weights by default, so they are not fine-tuned
embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
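
# A quick sanity check, as a sketch: the word picked below is arbitrary (just the first
# vocabulary entry), and this assumes embedding_matrix was filled from wv as above.
# Each row of the frozen embedding layer should match the corresponding Word2Vec vector.
_word, _idx = next(iter(vocab_to_int.items()))
assert torch.allclose(embedding_layer.weight[_idx], torch.FloatTensor(wv.wv[_word]))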


def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad each encoded review with zeros (or truncate it) to exactly seq_len tokens."""
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[:seq_len]
        features[i, :] = np.array(new)

    return features

def preprocess_single_string(
        input_string: str,
        seq_len: int,
        vocab_to_int: dict,
        verbose: bool = False
) -> torch.Tensor:
    """Clean a raw review, encode it with vocab_to_int and pad it to seq_len."""
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            if verbose:
                print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)

class BahdanauAttention(nn.Module):
    """Additive (Bahdanau) attention over the LSTM outputs."""

    def __init__(
            self,
            hidden_size: int = HIDDEN_SIZE
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.W = nn.Linear(hidden_size, hidden_size)  # projects the query (last hidden state)
        self.U = nn.Linear(hidden_size, hidden_size)  # projects the keys (LSTM outputs)
        self.V = nn.Linear(hidden_size, 1)            # scores each timestep
        self.tanh = nn.Tanh()

    def forward(
            self,
            keys: torch.Tensor,    # (batch, seq_len, hidden_size)
            query: torch.Tensor    # (batch, hidden_size)
    ):
        query = query.unsqueeze(1)  # (batch, 1, hidden_size)
        r_query = self.W(query)
        r_keys = self.U(keys)

        scores = self.V(self.tanh(r_query + r_keys))  # (batch, seq_len, 1)
        scores = scores.squeeze(-1)                   # (batch, seq_len)
        att_weights = F.softmax(scores, dim=1)
        # Weighted sum of the keys gives the context vector: (batch, hidden_size)
        context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze(1)
        return context, att_weights
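
# A minimal shape sanity check for BahdanauAttention, as a sketch: the batch size and
# random tensors below are illustrative assumptions, not project data. It verifies that
# the context vector and attention weights have the expected shapes and that the
# weights sum to 1 along the time axis.
_keys = torch.randn(4, SEQ_LEN, HIDDEN_SIZE)   # stand-in for LSTM outputs
_query = torch.randn(4, HIDDEN_SIZE)           # stand-in for the last hidden state
_context, _weights = BahdanauAttention(HIDDEN_SIZE)(_keys, _query)
assert _context.shape == (4, HIDDEN_SIZE)
assert _weights.shape == (4, SEQ_LEN)
assert torch.allclose(_weights.sum(dim=1), torch.ones(4))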


class LSTMBahdanauAttention(nn.Module):
    """Single-layer LSTM over frozen Word2Vec embeddings with Bahdanau attention and a classification head."""

    def __init__(self) -> None:
        super().__init__()
        self.embedding = embedding_layer
        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
        self.attn = BahdanauAttention(HIDDEN_SIZE)
        self.clf = nn.Sequential(
            nn.Linear(HIDDEN_SIZE, 128),
            nn.Dropout(),
            nn.Tanh(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        embeddings = self.embedding(x)             # (batch, seq_len, EMBEDDING_DIM)
        outputs, (h_n, _) = self.lstm(embeddings)  # outputs: (batch, seq_len, HIDDEN_SIZE)
        # The last hidden state is the attention query over all LSTM outputs
        context, att_weights = self.attn(outputs, h_n.squeeze(0))
        out = self.clf(context)                    # one raw logit per review
        return out, att_weights
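
# A minimal inference sketch, assuming the model is trained for binary sentiment
# classification of reviews (training code is not shown here). The sample review
# string below is a hypothetical example, not taken from the dataset.
model = LSTMBahdanauAttention()
model.eval()

sample_review = 'Очень внимательный врач, спасибо за приём'  # hypothetical review text
sample = preprocess_single_string(sample_review, SEQ_LEN, vocab_to_int).unsqueeze(0)  # (1, SEQ_LEN)

with torch.no_grad():
    logit, att_weights = model(sample)
    prob = torch.sigmoid(logit).item()  # probability of the positive class
print(f'positive probability: {prob:.3f}')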