dopset and TaskOne
- dopset.py +80 -0
- pages/TaskOne.py +97 -0
dopset.py
ADDED
@@ -0,0 +1,80 @@
import pickle
import string

import numpy as np
import torch
import torch.nn as nn
from nltk.corpus import stopwords

# Requires the NLTK stop-word corpus: nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

EMBEDDING_DIM = 64
VOCAB_SIZE = 203310

# Placeholder zero matrix; the real embedding weights are restored when the
# saved state dict is loaded into the model.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))

with open('lstm/vocab_to_int.txt', 'rb') as f:
    vocab_to_int = pickle.load(f)


class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim: int, hidden_size: int = 32) -> None:
        super().__init__()

        self.embedding_dim = embedding_dim  # embedding size
        self.hidden_size = hidden_size      # hidden size
        self.embedding = embedding_layer    # model layers

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            batch_first=True
        )

        self.clf = nn.Linear(self.hidden_size, 1)

    def forward(self, x):
        embedding = self.embedding(x)
        _, (h_n, _) = self.lstm(embedding)
        # h_n has shape (num_layers, batch, hidden_size); drop the layer axis.
        out = self.clf(h_n.squeeze(0))
        return out


def data_preprocessing(text: str) -> str:
    """Lowercase the text, remove punctuation and drop English stop words."""
    text = text.lower()
    text = ''.join(c for c in text if c not in string.punctuation)
    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(words)


def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad (or truncate) each encoded review to exactly seq_len tokens."""
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        if len(review) <= seq_len:
            new = [0] * (seq_len - len(review)) + review
        else:
            new = review[:seq_len]
        features[i, :] = np.array(new)

    return features


def preprocess_single_string(input_string: str, seq_len: int, vocab_to_int: dict = vocab_to_int) -> torch.Tensor:
    """Clean a raw string, map its words to vocabulary ids and pad to seq_len."""
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)
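Taken together, the module exposes everything needed to score one raw string with the saved LSTM. A minimal usage sketch, mirroring how pages/TaskOne.py calls these helpers; the sample review text is made up, and it assumes the lstm/ artifacts above are on disk:

import torch
from dopset import LSTMClassifier, preprocess_single_string

# Hidden size 16 matches the checkpoint as used in pages/TaskOne.py.
model = LSTMClassifier(embedding_dim=64, hidden_size=16)
model.load_state_dict(torch.load('lstm/lstm_weights.pt', map_location='cpu'))
model.eval()

sample = preprocess_single_string('Great movie, loved every minute', seq_len=128)
with torch.no_grad():
    prob = model(sample.unsqueeze(0)).sigmoid().item()
print('Positive' if prob >= 0.5 else 'Negative', round(prob, 3))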
pages/TaskOne.py
ADDED
@@ -0,0 +1,97 @@
import pickle
import time

import pandas as pd
import streamlit as st
import torch
from transformers import DistilBertModel, DistilBertTokenizer

from dopset import LSTMClassifier, preprocess_single_string

# Pickled classifier applied to DistilBERT [CLS] vectors, plus the saved
# encoder and its tokenizer.
loaded = pickle.load(open('BertWeight/log.pkl', 'rb'))
model_BERT = DistilBertModel.from_pretrained('BertWeight/pt_save_pretrained')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

st.title('Reviews')


@st.cache_resource
def load_model_and_vectorizer():
    """Load (and cache) the TF-IDF vectorizer and logistic-regression model."""
    loaded_vectorizer = pickle.load(open('tfidf/tfidf_vectorizer.pkl', 'rb'))
    loaded_model = pickle.load(open('tfidf/logistic_regression_model.pkl', 'rb'))
    return loaded_vectorizer, loaded_model


def logreg(text):
    """Classify a review with TF-IDF + logistic regression; return label and latency."""
    weight_vect, weight_model = load_model_and_vectorizer()

    start_time = time.time()
    features = weight_vect.transform([text])
    output = weight_model.predict(features)
    end_time = time.time()

    label = 'Positive' if output[0] == 1 else 'Negative'
    return label, round(end_time - start_time, 5)


def lstm_(text):
    """Classify a review with the LSTM from dopset.py; return label and latency."""
    EMBEDDING_DIM = 64
    HIDDEN_DIM = 16
    DEVICE = 'cpu'

    model = LSTMClassifier(embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_DIM).to(DEVICE)
    model.load_state_dict(torch.load('lstm/lstm_weights.pt', map_location=DEVICE))
    model.eval()

    start_time = time.time()
    with torch.no_grad():
        pred = model(preprocess_single_string(text, seq_len=128).unsqueeze(0).to(DEVICE)).sigmoid().round().item()
    end_time = time.time()

    label = 'Positive' if pred == 1 else 'Negative'
    return label, round(end_time - start_time, 5)


def bert_(text, model, loaded_model):
    """Embed a review with DistilBERT's [CLS] vector, then classify it; return label and latency."""
    start_time = time.time()
    tokenized_text = tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=64)
    input_ids = torch.tensor(tokenized_text).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
    vectors = outputs[0][:, 0, :].detach().cpu().numpy()

    prediction = loaded_model.predict(vectors).item()
    end_time = time.time()

    label = 'Positive' if prediction == 1 else 'Negative'
    return label, round(end_time - start_time, 5)


table_f1 = {'Model': ['Tf-IDF + logreg', 'LSTM', 'Bert'],
            'F1-score': ['0.91', '0.94', '0.74']}

df = pd.DataFrame(table_f1)

text = st.text_input('Write a review')

if text:
    rate, time_ = logreg(text)
    st.markdown('### Tf-IDF + logreg')
    st.write('Review:', rate)
    st.write('Time:', time_)

    rate_lstm, time_lstm = lstm_(text)
    st.markdown('### LSTM')
    st.write('Review:', rate_lstm)
    st.write('Time:', time_lstm)

    rate_bert, time_bert = bert_(text, model_BERT, loaded)
    st.markdown('### BERT')
    st.write('Review:', rate_bert)
    st.write('Time:', time_bert)

    st.table(df)
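The pickled TF-IDF pipeline can also be smoke-tested outside Streamlit. A minimal sketch, assuming the tfidf/ artifacts referenced above are present in the working directory and that label 1 means positive, as in logreg(); the sample sentence is made up:

import pickle

# Same artifacts TaskOne.py loads; run from the repo root.
vectorizer = pickle.load(open('tfidf/tfidf_vectorizer.pkl', 'rb'))
clf = pickle.load(open('tfidf/logistic_regression_model.pkl', 'rb'))

features = vectorizer.transform(['The acting was wonderful'])
print('Positive' if clf.predict(features)[0] == 1 else 'Negative')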