# NLP_project / model_file.py
import re
import string
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from gensim.models import Word2Vec
from nltk.corpus import stopwords

# Requires the NLTK stopword corpus: run nltk.download('stopwords') once if it is missing
stop_words = set(stopwords.words('russian'))

HIDDEN_SIZE = 32  # LSTM hidden state size
SEQ_LEN = 32      # fixed review length after padding / truncation
# Raw healthcare facility reviews, one JSON object per line
df = pd.read_json('/Users/olgaseina/ds-phase-2/10-nlp/data/tg_channels/healthcare_facilities_reviews.jsonl', lines=True)
def data_preprocessing(text: str) -> str:
    text = text.lower()
    text = re.sub('<.*?>', '', text)  # remove HTML tags
    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words])  # remove stop words
    text = ' '.join([word for word in text.split() if not word.isdigit()])  # drop purely numeric tokens
    return text
contents = df['content'].tolist()
preprocessed = [data_preprocessing(content) for content in contents]
corpus = [word for text in preprocessed for word in text.split()]
sorted_words = Counter(corpus).most_common()
def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
    return list(filter(lambda x: x[1] > n, sorted_words))
sorted_words = get_words_by_freq(sorted_words, 100)
# sorted_words[-10:]  # inspect the least frequent words that survived the filter
vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}
reviews_int = []
for text in preprocessed:
    r = [vocab_to_int[word] for word in text.split() if vocab_to_int.get(word)]
    reviews_int.append(r)
w2v_input = []
for review in preprocessed:
    cur_review = []
    for word in review.split():
        if vocab_to_int.get(word):
            cur_review.append(word)
    w2v_input.append(cur_review)
VOCAB_SIZE = len(vocab_to_int) + 1  # +1: index 0 is reserved for padding
EMBEDDING_DIM = 64
wv = Word2Vec(
    min_count=1,                # minimum number of occurrences in the corpus
    vector_size=EMBEDDING_DIM   # dimensionality of each word vector
)
wv.build_vocab(w2v_input)
wv.train(
    corpus_iterable=w2v_input,
    total_examples=wv.corpus_count,
    epochs=10
)
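# Optional sanity check (an addition, not part of the original pipeline): with min_count=1
# every word kept in w2v_input has a trained vector, so the nearest neighbours of the most
# frequent vocabulary word should look semantically related.
_neighbours = wv.wv.most_similar(sorted_words[0][0], topn=5)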
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
# Walk over the vocabulary: if a word has a Word2Vec vector, copy it into the matrix;
# otherwise print the word and skip it (its row stays all zeros)
for word, i in vocab_to_int.items():
    try:
        embedding_matrix[i] = wv.wv[word]
    except KeyError as e:
        print(f'{e}: word: {word}')

# Pretrained embedding layer; from_pretrained freezes it, so it will not be trained with the network
embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad each encoded review with zeros, or truncate it, to exactly seq_len tokens."""
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        if len(review) <= seq_len:
            new = list(np.zeros(seq_len - len(review))) + review
        else:
            new = review[:seq_len]
        features[i, :] = np.array(new)
    return features
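# Quick self-check of padding (added for illustration): a short review is left-padded
# with zeros and a long one is truncated to seq_len.
_demo = padding([[5, 6], [1, 2, 3, 4, 5]], 4)
assert _demo.shape == (2, 4)
assert list(_demo[0]) == [0, 0, 5, 6]  # left-padded with zeros
assert list(_demo[1]) == [1, 2, 3, 4]  # truncated to seq_len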
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose: bool = False
) -> torch.Tensor:
    """Clean a raw review, encode it with vocab_to_int and pad it to seq_len indices."""
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            if verbose:
                print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]
    return torch.tensor(result_padded)
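# Example usage (illustrative): encode the first raw review from the dataset into a
# fixed-length tensor of vocabulary indices, ready to be fed to the model.
_sample = preprocess_single_string(contents[0], SEQ_LEN, vocab_to_int)
assert _sample.shape == (SEQ_LEN,)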
class BahdanauAttention(nn.Module):
    def __init__(
        self,
        hidden_size: int = HIDDEN_SIZE
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.W = nn.Linear(hidden_size, hidden_size)
        self.U = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)
        self.tanh = nn.Tanh()

    def forward(
        self,
        keys: torch.Tensor,   # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
        query: torch.Tensor   # BATCH_SIZE x HIDDEN_SIZE
    ):
        query = query.unsqueeze(1)                     # BATCH_SIZE x 1 x HIDDEN_SIZE
        r_query = self.W(query)                        # BATCH_SIZE x 1 x HIDDEN_SIZE
        r_keys = self.U(keys)                          # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
        scores = self.V(torch.tanh(r_query + r_keys))  # BATCH_SIZE x SEQ_LEN x 1
        scores = scores.squeeze(-1)                    # BATCH_SIZE x SEQ_LEN
        att_weights = F.softmax(scores, dim=1)         # BATCH_SIZE x SEQ_LEN
        context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze(1)  # BATCH_SIZE x HIDDEN_SIZE
        return context, att_weights
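# Shape sanity check for the attention block (added for illustration): with a random batch
# of keys and a random query, the context and the attention weights come back with the
# shapes annotated in forward().
_attn = BahdanauAttention(HIDDEN_SIZE)
_context, _weights = _attn(torch.randn(4, SEQ_LEN, HIDDEN_SIZE), torch.randn(4, HIDDEN_SIZE))
assert _context.shape == (4, HIDDEN_SIZE)
assert _weights.shape == (4, SEQ_LEN)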
class LSTMBahdanauAttention(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # self.embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
        self.embedding = embedding_layer
        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
        self.attn = BahdanauAttention(HIDDEN_SIZE)
        self.clf = nn.Sequential(
            nn.Linear(HIDDEN_SIZE, 128),
            nn.Dropout(),
            nn.Tanh(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        embeddings = self.embedding(x)
        outputs, (h_n, _) = self.lstm(embeddings)
        context, att_weights = self.attn(outputs, h_n.squeeze(0))
        out = self.clf(context)
        return out, att_weights
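# End-to-end smoke test (illustrative sketch; the LSTM and classifier here are untrained,
# so the score is meaningless until the model has been fitted on the reviews). The single
# output logit is read as a binary probability via sigmoid, assuming a binary target,
# which this file does not define itself.
if __name__ == '__main__':
    model = LSTMBahdanauAttention()
    model.eval()
    sample = preprocess_single_string(contents[0], SEQ_LEN, vocab_to_int).unsqueeze(0)  # add batch dim
    with torch.no_grad():
        logit, att_weights = model(sample)
    prob = torch.sigmoid(logit).item()
    print(f'predicted probability: {prob:.3f}, attention weights shape: {tuple(att_weights.shape)}')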