import re
import string
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from gensim.models import Word2Vec
from nltk.corpus import stopwords

# Requires the NLTK stopwords corpus: nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))
HIDDEN_SIZE = 32  # LSTM hidden state size
SEQ_LEN = 32      # fixed review length after padding/truncation

df = pd.read_json('healthcare_facilities_reviews.jsonl', lines=True)
def data_preprocessing(text: str) -> str:
    """Lowercase the text and strip HTML tags, punctuation, Russian stopwords and standalone digits."""
    text = text.lower()
    text = re.sub('<.*?>', '', text)  # remove HTML tags
    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words])  # remove stopwords
    text = ' '.join([word for word in text.split() if not word.isdigit()])  # drop pure digits
    return text
contents = df['content'].tolist()
preprocessed = [data_preprocessing(content) for content in contents]
corpus = [word for text in preprocessed for word in text.split()]
sorted_words = Counter(corpus).most_common()
def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
    """Keep only the words that occur more than n times in the corpus."""
    return list(filter(lambda x: x[1] > n, sorted_words))

sorted_words = get_words_by_freq(sorted_words, 100)
# Word-to-index mapping; index 0 is reserved for padding
vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}
# Encode each preprocessed review as a sequence of vocabulary indices,
# dropping words that fell below the frequency threshold
reviews_int = []
for text in preprocessed:
    r = [vocab_to_int[word] for word in text.split() if word in vocab_to_int]
    reviews_int.append(r)

# Word2Vec input: the same reviews as lists of tokens, restricted to the vocabulary
w2v_input = []
for review in preprocessed:
    cur_review = [word for word in review.split() if word in vocab_to_int]
    w2v_input.append(cur_review)

VOCAB_SIZE = len(vocab_to_int) + 1  # +1 for the padding index 0
EMBEDDING_DIM = 64
wv = Word2Vec(
    min_count=1,                # minimum word frequency in the corpus
    vector_size=EMBEDDING_DIM   # dimensionality of the word vectors
)
wv.build_vocab(w2v_input)
wv.train(
    corpus_iterable=w2v_input,
    total_examples=wv.corpus_count,
    epochs=10
)
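# Quick check sketch (assumption: illustrative only, not part of the original pipeline):
# after training, semantically related words should end up close in the embedding space, e.g.
#   wv.wv.most_similar('врач', topn=5)  # nearest neighbours of a frequent word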
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
# Walk over the vocabulary: if a word has a Word2Vec vector, copy it into the matrix;
# otherwise print the word and leave its row as zeros
for word, i in vocab_to_int.items():
    try:
        embedding_matrix[i] = wv.wv[word]
    except KeyError as e:
        print(f'{e}: word: {word}')

# Pretrained embedding layer; from_pretrained defaults to freeze=True, so this layer is not trained with the network
embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
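# Sanity-check sketch (assumption: added for illustration, not part of the original pipeline):
# row 0 of the embedding matrix is the all-zero padding vector, and any word that made it
# into the Word2Vec vocabulary should get back exactly its trained vector.
_check_word, _check_id = next(iter(vocab_to_int.items()))
assert embedding_layer.weight[0].abs().sum() == 0
if _check_word in wv.wv:
    assert torch.allclose(embedding_layer.weight[_check_id], torch.tensor(wv.wv[_check_word]))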
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad each encoded review with zeros (or truncate it) to exactly seq_len tokens."""
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[:seq_len]
        features[i, :] = np.array(new)
    return features
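# Example sketch (assumption: illustrative only): with seq_len=5, padding() left-pads
# short reviews with zeros and truncates long ones, e.g.
#   padding([[3, 7], [1, 2, 3, 4, 5, 6]], 5)
#   -> array([[0, 0, 0, 3, 7],
#             [1, 2, 3, 4, 5]])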
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose: bool = False
) -> torch.Tensor:
    """Clean a raw review, encode it with vocab_to_int and pad/truncate it to seq_len tokens."""
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            if verbose:
                print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]
    return torch.tensor(result_padded)
class BahdanauAttention(nn.Module):
    """Additive (Bahdanau) attention over the LSTM outputs."""

    def __init__(
        self,
        hidden_size: int = HIDDEN_SIZE
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.W = nn.Linear(hidden_size, hidden_size)
        self.U = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)
        self.tanh = nn.Tanh()

    def forward(
        self,
        keys: torch.Tensor,   # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
        query: torch.Tensor   # BATCH_SIZE x HIDDEN_SIZE
    ):
        query = query.unsqueeze(1)                    # BATCH_SIZE x 1 x HIDDEN_SIZE
        r_query = self.W(query)                       # BATCH_SIZE x 1 x HIDDEN_SIZE
        r_keys = self.U(keys)                         # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
        scores = self.V(self.tanh(r_query + r_keys))  # BATCH_SIZE x SEQ_LEN x 1
        scores = scores.squeeze(-1)                   # BATCH_SIZE x SEQ_LEN
        att_weights = F.softmax(scores, dim=1)        # BATCH_SIZE x SEQ_LEN
        context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze(1)  # BATCH_SIZE x HIDDEN_SIZE
        return context, att_weights
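# Shape-check sketch (assumption: added for illustration, not part of the original module):
# attention should turn keys (batch, SEQ_LEN, HIDDEN_SIZE) and a query (batch, HIDDEN_SIZE)
# into a context vector (batch, HIDDEN_SIZE) and weights (batch, SEQ_LEN) that sum to 1.
_attn_check = BahdanauAttention()
_ctx, _w = _attn_check(torch.randn(2, SEQ_LEN, HIDDEN_SIZE), torch.randn(2, HIDDEN_SIZE))
assert _ctx.shape == (2, HIDDEN_SIZE) and _w.shape == (2, SEQ_LEN)
assert torch.allclose(_w.sum(dim=1), torch.ones(2))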
class LSTMBahdanauAttention(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # Frozen pretrained Word2Vec embeddings
        # (alternative: a trainable nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM))
        self.embedding = embedding_layer
        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
        self.attn = BahdanauAttention(HIDDEN_SIZE)
        self.clf = nn.Sequential(
            nn.Linear(HIDDEN_SIZE, 128),
            nn.Dropout(),
            nn.Tanh(),
            nn.Linear(128, 1)  # single logit; apply sigmoid / BCEWithLogitsLoss outside the model
        )

    def forward(self, x):
        embeddings = self.embedding(x)                              # BATCH_SIZE x SEQ_LEN x EMBEDDING_DIM
        outputs, (h_n, _) = self.lstm(embeddings)                   # outputs: BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
        context, att_weights = self.attn(outputs, h_n.squeeze(0))   # context: BATCH_SIZE x HIDDEN_SIZE
        out = self.clf(context)                                     # BATCH_SIZE x 1
        return out, att_weights
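# Usage sketch (assumption: not part of the original module; the checkpoint path
# 'lstm_bahdanau.pt' is hypothetical). The classifier outputs a single logit, so a
# sigmoid is applied here to read it as a positive-review probability.
if __name__ == '__main__':
    model = LSTMBahdanauAttention()
    # model.load_state_dict(torch.load('lstm_bahdanau.pt', map_location='cpu'))  # hypothetical weights
    model.eval()

    sample_review = 'Очень внимательные врачи, спасибо за помощь!'
    batch = preprocess_single_string(sample_review, SEQ_LEN, vocab_to_int).long().unsqueeze(0)  # 1 x SEQ_LEN

    with torch.no_grad():
        logit, att_weights = model(batch)
    print(f'positive probability: {torch.sigmoid(logit).item():.3f}')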