Teery committed on
Commit
d128c4f
1 Parent(s): 4c3876c

dopset and TaskOne

Browse files
Files changed (2) hide show
  1. dopset.py +80 -0
  2. pages/TaskOne.py +97 -0
dopset.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import string
2
+ import torch
3
+ import numpy as np
4
+ from nltk.corpus import stopwords
5
+ stop_words = set(stopwords.words('english'))
6
+ import torch.nn as nn
7
+ import pickle
8
+ from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel
9
+ from sklearn.linear_model import LogisticRegression
10
+
11
+
12
+
13
# Width of one embedding vector and number of rows in the embedding table.
EMBEDDING_DIM = 64
VOCAB_SIZE = 203310

# Zero-filled table wrapped in an embedding layer.
# NOTE(review): presumably the real weights arrive later through
# load_state_dict() on a model that holds this layer - confirm.
embedding_matrix = np.zeros(shape=(VOCAB_SIZE, EMBEDDING_DIM))
embedding_layer = nn.Embedding.from_pretrained(
    torch.from_numpy(embedding_matrix).float()
)

# Word -> integer index mapping, unpickled once when the module is imported.
with open('lstm/vocab_to_int.txt', 'rb') as f:
    vocab_to_int = pickle.load(f)
23
+
24
class LSTMClassifier(nn.Module):
    """Text classifier: embedding lookup -> single-layer LSTM -> linear head.

    The embedding is the module-level ``embedding_layer`` object, so every
    instance of this class shares that one layer.

    Args:
        embedding_dim: Size of each embedding vector fed into the LSTM.
        hidden_size: Size of the LSTM hidden state.
    """

    def __init__(self, embedding_dim: int, hidden_size: int = 32) -> None:
        super().__init__()

        self.embedding_dim = embedding_dim  # embedding vector size
        self.hidden_size = hidden_size  # LSTM hidden state size
        self.embedding = embedding_layer  # shared module-level embedding

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            batch_first=True,
        )

        # Single output unit: one raw score (logit) per sequence.
        self.clf = nn.Linear(self.hidden_size, 1)

    def forward(self, x):
        """Return one raw score per input sequence of token indices.

        Args:
            x: Integer tensor of token indices accepted by the embedding layer.

        Returns:
            Output of the linear head applied to the final LSTM hidden state;
            for a batched input of ``B`` sequences the shape is ``(B, 1)``.
        """
        embedding = self.embedding(x)
        _, (h_n, _) = self.lstm(embedding)
        # h_n has a leading (num_layers * num_directions) axis, which is 1 for
        # the single unidirectional LSTM built in __init__. Drop only that
        # axis: a bare squeeze() would also remove the batch axis whenever the
        # batch holds a single sequence.
        out = self.clf(h_n.squeeze(0))
        return out
45
+
46
+
47
def data_preprocessing(text: str) -> str:
    """Normalize raw text for the vocabulary lookup.

    Lower-cases the input, deletes every character listed in
    ``string.punctuation``, drops the words found in the module-level
    ``stop_words`` set and joins the remaining words with single spaces.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, space-separated string.
    """
    # One pass that deletes all punctuation characters.
    punctuation_free = text.lower().translate(
        str.maketrans('', '', string.punctuation)
    )
    kept_words = (
        token for token in punctuation_free.split() if token not in stop_words
    )
    return ' '.join(kept_words)
53
+
54
+
55
+
56
def padding(review_int: list, seq_len: int) -> np.array:
    """Left-pad or truncate integer sequences to a fixed length.

    Args:
        review_int: List of sequences (lists of integer token ids).
        seq_len: Length of every output row.

    Returns:
        Integer array of shape ``(len(review_int), seq_len)``. Sequences
        shorter than ``seq_len`` are filled with zeros on the left; longer
        ones keep only their first ``seq_len`` items.
    """
    padded = np.zeros((len(review_int), seq_len), dtype=int)
    for row, tokens in enumerate(review_int):
        clipped = tokens[:seq_len]
        # Write the tokens right-aligned; the untouched prefix stays zero.
        padded[row, seq_len - len(clipped):] = clipped
    return padded
67
+
68
def preprocess_single_string(input_string: str, seq_len: int, vocab_to_int: dict = vocab_to_int) -> torch.Tensor:
    """Turn one raw string into a fixed-length tensor of vocabulary indices.

    The string is cleaned with ``data_preprocessing``, each remaining word is
    mapped through ``vocab_to_int`` and the resulting index list is padded or
    truncated to ``seq_len`` with ``padding``.

    Args:
        input_string: Raw text to encode.
        seq_len: Length of the returned sequence.
        vocab_to_int: Word -> index mapping; defaults to the module-level
            vocabulary.

    Returns:
        1-D integer tensor of length ``seq_len``.
    """
    result_list = []
    for word in data_preprocessing(input_string).split():
        index = vocab_to_int.get(word)
        if index is None:
            # Unknown words are skipped; the message matches the text that the
            # previous KeyError-based version printed.
            print(f'{word!r}: not in dictionary!')
            continue
        result_list.append(index)

    result_padded = padding([result_list], seq_len)[0]
    return torch.tensor(result_padded)
79
+
80
+
pages/TaskOne.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ import torch.nn as nn
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.linear_model import LogisticRegression
6
+ import pickle
7
+ import time
8
+ from dopset import (LSTMClassifier, preprocess_single_string)
9
+ import numpy as np
10
+ import pandas as pd
11
+ from transformers import DistilBertModel, DistilBertTokenizer
12
+
13
@st.cache_resource(show_spinner=False)
def _load_bert_resources():
    """Load the pickled classifier, the DistilBERT encoder and its tokenizer.

    Cached with ``st.cache_resource`` so that the objects are created once
    instead of on every script rerun.

    Returns:
        Tuple ``(classifier, encoder, tokenizer)``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The context manager closes the file handle, which the earlier bare
    # open() call left open.
    with open('BertWeight/log.pkl', "rb") as classifier_file:
        classifier = pickle.load(classifier_file)
    encoder = DistilBertModel.from_pretrained("BertWeight/pt_save_pretrained")
    bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    return classifier, encoder, bert_tokenizer


# Module-level names used by the rest of the page.
loaded, model_BERT, tokenizer = _load_bert_resources()

st.title('Отзывы')
18
+
19
@st.cache_resource
def load_model_and_vectorizer():
    """Load the pickled TF-IDF vectorizer and logistic-regression model.

    Returns:
        Tuple ``(vectorizer, model)`` unpickled from the ``tfidf/`` directory.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # Context managers make sure both file handles are closed after reading.
    with open('tfidf/tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
        loaded_vectorizer = pickle.load(vectorizer_file)
    with open('tfidf/logistic_regression_model.pkl', 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    return loaded_vectorizer, loaded_model
26
+
27
def logreg(text):
    """Classify a review with the TF-IDF + logistic-regression pipeline.

    Args:
        text: Review text entered by the user.

    Returns:
        Tuple ``(label, seconds)``: the label string shown in the UI and the
        time spent on vectorizing and predicting, rounded to 5 decimals.
    """
    vectorizer, classifier = load_model_and_vectorizer()

    # Only the transform + predict steps are timed, not the model loading.
    started = time.time()
    features = vectorizer.transform([text])
    output = classifier.predict(features)
    elapsed = round(time.time() - started, 5)

    label = "положительный" if output == 1 else "Негативный"
    return label, elapsed
39
+
40
def lstm_(text):
    """Classify a review with the LSTM model.

    Builds an ``LSTMClassifier``, loads its weights from
    ``lstm/lstm_weights.pt`` and runs it on the encoded text.

    Args:
        text: Review text entered by the user.

    Returns:
        Tuple ``(label, seconds)``: the label string shown in the UI and the
        time spent on encoding the text and running the model, rounded to
        5 decimals. Model construction and weight loading are not timed.
    """
    EMBEDDING_DIM = 64
    HIDDEN_DIM = 16
    DEVICE = 'cpu'

    model = LSTMClassifier(embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_DIM).to(DEVICE)
    model.load_state_dict(torch.load('lstm/lstm_weights.pt', map_location=DEVICE))
    # Inference only: evaluation mode and no autograd bookkeeping, matching
    # the way bert_ runs its model.
    model.eval()

    start_time = time.time()
    with torch.no_grad():
        batch = preprocess_single_string(text, seq_len=128).unsqueeze(0).to(DEVICE)
        # Sigmoid turns the raw score into a value in (0, 1); round() picks 0 or 1.
        pred = model(batch).sigmoid().round().item()
    end_time = time.time()

    if pred == 1:
        return "положительный", round(end_time - start_time, 5)
    else:
        return "Негативный", round(end_time - start_time, 5)
54
+
55
def bert_(text, model, loaded_model):
    """Classify a review with encoder features and a fitted classifier.

    Args:
        text: Review text entered by the user.
        model: Encoder called on the token-id batch.
        loaded_model: Classifier whose ``predict`` receives the extracted vectors.

    Returns:
        Tuple ``(label, seconds)``: the label string shown in the UI and the
        time spent on tokenizing, encoding and predicting, rounded to
        5 decimals.
    """
    started = time.time()

    # Uses the module-level tokenizer; input is truncated to 64 tokens.
    token_ids = tokenizer.encode(
        text, add_special_tokens=True, truncation=True, max_length=64
    )
    batch = torch.tensor(token_ids).unsqueeze(0)
    with torch.no_grad():
        outputs = model(batch)
        # Vector at sequence position 0 of the first model output
        # (presumably the [CLS] token, since special tokens are added).
        first_token_vectors = outputs[0][:, 0, :].detach().cpu().numpy()

    prediction = loaded_model.predict(first_token_vectors).item()
    elapsed = round(time.time() - started, 5)

    label = "положительный" if prediction == 1 else "Негативный"
    return label, elapsed
69
+
70
+
71
# Hard-coded F1 scores shown in the comparison table, one entry per model.
table_f1 = {}
table_f1['Model'] = ['Tf-IDF + logreg', 'LSTM', 'Bert']
table_f1['F1-score'] = ['0.91', '0.94', '0.74']

df = pd.DataFrame(data=table_f1)
75
+
76
text = st.text_input("Напишите отзыв")

if text:
    # One entry per model: section heading plus a zero-argument call that
    # runs that model on the entered text. Order matches the page layout.
    sections = (
        ('### Tf-IDF + logreg', lambda: logreg(text)),
        ('### LSTM', lambda: lstm_(text)),
        ('### BERT', lambda: bert_(text, model_BERT, loaded)),
    )
    for heading, run_model in sections:
        verdict, elapsed = run_model()
        st.markdown(heading)
        st.write('Отзыв:', verdict)
        st.write('Время:', elapsed)

    # Comparison table of the hard-coded F1 scores.
    st.table(df)
95
+
96
+
97
+