import re
import string
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from gensim.models import Word2Vec
from nltk.corpus import stopwords

# Russian stop words (requires a one-time nltk.download('stopwords'))
stop_words = set(stopwords.words('russian'))

HIDDEN_SIZE = 32
SEQ_LEN = 32
df = pd.read_json('healthcare_facilities_reviews.jsonl', lines=True)
def data_preprocessing(text: str) -> str:
    text = text.lower()
    text = re.sub('<.*?>', '', text)  # strip HTML tags
    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words])  # drop stop words
    text = ' '.join([word for word in text.split() if not word.isdigit()])  # drop pure digits
    return text
contents = df['content'].tolist()
preprocessed = [data_preprocessing(content) for content in contents]
corpus = [word for text in preprocessed for word in text.split()]
sorted_words = Counter(corpus).most_common()
def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
    # keep only words that occur more than n times in the corpus
    return list(filter(lambda x: x[1] > n, sorted_words))

sorted_words = get_words_by_freq(sorted_words, 100)
sorted_words[-10:]  # peek at the least frequent words that survived the cut-off
# index 0 is reserved for padding, so word ids start at 1
vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}

# encode each review as a list of word ids, skipping out-of-vocabulary words
reviews_int = []
for text in preprocessed:
    r = [vocab_to_int[word] for word in text.split() if vocab_to_int.get(word)]
    reviews_int.append(r)
# Word2Vec expects tokenised sentences: keep only in-vocabulary words per review
w2v_input = []
for review in preprocessed:
    cur_review = []
    for word in review.split():
        if vocab_to_int.get(word):
            cur_review.append(word)
    w2v_input.append(cur_review)
VOCAB_SIZE = len(vocab_to_int) + 1  # +1 for the padding index 0
EMBEDDING_DIM = 64

wv = Word2Vec(
    min_count=1,                # minimum number of occurrences in the corpus
    vector_size=EMBEDDING_DIM   # dimensionality of each word vector
)
wv.build_vocab(w2v_input)
wv.train(
    corpus_iterable=w2v_input,
    total_examples=wv.corpus_count,
    epochs=10
)
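
# Optional sanity check (a sketch, not required for the pipeline): every word kept in
# vocab_to_int also occurs in w2v_input, so with min_count=1 the Word2Vec vocabulary
# should match ours exactly (VOCAB_SIZE minus the padding index 0).
assert len(wv.wv) == VOCAB_SIZE - 1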
# Build the embedding matrix: for every word in the vocabulary take its Word2Vec
# vector; if a word is missing, report it and leave its row as zeros.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, i in vocab_to_int.items():
    try:
        embedding_matrix[i] = wv.wv[word]
    except KeyError as e:
        print(f'{e}: word: {word}')

# Pre-trained embedding layer; from_pretrained freezes it, so it will not be trained in our network
embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
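
# Quick shape check (sketch): a padded review of SEQ_LEN ids maps to a SEQ_LEN x EMBEDDING_DIM
# matrix of pre-trained vectors; from_pretrained leaves the weights frozen by default.
assert embedding_layer(torch.zeros(SEQ_LEN, dtype=torch.long)).shape == (SEQ_LEN, EMBEDDING_DIM)
assert not embedding_layer.weight.requires_grad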
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad (or truncate) each encoded review to exactly seq_len ids."""
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[:seq_len]
        features[i, :] = np.array(new)
    return features
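
# Small worked example with made-up token ids: a review shorter than seq_len is
# left-padded with zeros, a longer one is truncated to its first seq_len ids.
assert (padding([[3, 7, 2]], 5) == np.array([[0, 0, 3, 7, 2]])).all()
assert (padding([[3, 7, 2, 9, 4, 1]], 5) == np.array([[3, 7, 2, 9, 4]])).all()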
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose: bool = False
) -> torch.Tensor:
    """Clean a raw review, encode it with vocab_to_int and pad it to seq_len."""
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            if verbose:
                print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]
    return torch.tensor(result_padded)
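
# Usage sketch (the review text is made up for illustration): a raw string is cleaned,
# mapped to ids and padded/truncated, so the result is always a 1-D tensor of length SEQ_LEN.
_sample = preprocess_single_string('Отличная клиника, врачи очень внимательные', SEQ_LEN, vocab_to_int)
assert _sample.shape == (SEQ_LEN,)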
class BahdanauAttention(nn.Module):
    def __init__(
        self,
        hidden_size: int = HIDDEN_SIZE
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.W = nn.Linear(hidden_size, hidden_size)  # projects the query (last hidden state)
        self.U = nn.Linear(hidden_size, hidden_size)  # projects the keys (encoder outputs)
        self.V = nn.Linear(hidden_size, 1)            # collapses the combined score to a scalar
        self.tanh = nn.Tanh()

    def forward(
        self,
        keys: torch.Tensor,   # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
        query: torch.Tensor   # BATCH_SIZE x HIDDEN_SIZE
    ):
        query = query.unsqueeze(1)                    # BATCH_SIZE x 1 x HIDDEN_SIZE
        r_query = self.W(query)                       # BATCH_SIZE x 1 x HIDDEN_SIZE
        r_keys = self.U(keys)                         # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
        scores = self.V(self.tanh(r_query + r_keys))  # BATCH_SIZE x SEQ_LEN x 1
        scores = scores.squeeze(-1)                   # BATCH_SIZE x SEQ_LEN
        att_weights = F.softmax(scores, dim=1)        # BATCH_SIZE x SEQ_LEN
        context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze(1)  # BATCH_SIZE x HIDDEN_SIZE
        return context, att_weights
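
# Quick shape check for the attention block (sketch with a hypothetical batch of 4):
# the context vector collapses the SEQ_LEN dimension, the weights keep it.
_attn = BahdanauAttention(HIDDEN_SIZE)
_ctx, _w = _attn(torch.randn(4, SEQ_LEN, HIDDEN_SIZE), torch.randn(4, HIDDEN_SIZE))
assert _ctx.shape == (4, HIDDEN_SIZE) and _w.shape == (4, SEQ_LEN)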
class LSTMBahdanauAttention(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # self.embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)  # trainable alternative
        self.embedding = embedding_layer  # frozen, pre-trained Word2Vec embeddings
        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
        self.attn = BahdanauAttention(HIDDEN_SIZE)
        self.clf = nn.Sequential(
            nn.Linear(HIDDEN_SIZE, 128),
            nn.Dropout(),
            nn.Tanh(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        embeddings = self.embedding(x)             # BATCH_SIZE x SEQ_LEN x EMBEDDING_DIM
        outputs, (h_n, _) = self.lstm(embeddings)  # outputs: BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
        context, att_weights = self.attn(outputs, h_n.squeeze(0))
        out = self.clf(context)                    # BATCH_SIZE x 1 logit
        return out, att_weights
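
# Minimal end-to-end sketch (assumes the preprocessing above has run and the dataset has
# at least four reviews): the model is untrained, so the logits are meaningless, but the
# shapes show how the pieces fit together.
_model = LSTMBahdanauAttention()
_batch = torch.stack(
    [preprocess_single_string(t, SEQ_LEN, vocab_to_int) for t in contents[:4]]
)
_logits, _att = _model(_batch)
assert _logits.shape == (4, 1) and _att.shape == (4, SEQ_LEN)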