vasevooo committed on
Commit
3e188e6
1 Parent(s): 0be825a

Update pages/imdb.py

Browse files
Files changed (1) hide show
  1. pages/imdb.py +138 -91
pages/imdb.py CHANGED
@@ -1,110 +1,157 @@
 
 
1
  import pandas as pd
 
 
2
  import streamlit as st
3
- import torch
 
 
 
 
 
 
 
4
  import transformers
 
 
 
 
5
  import time
6
- import pickle
7
- import numpy as np
8
- from gensim.models import Word2Vec
9
- from sklearn.feature_extraction.text import TfidfVectorizer
10
  from sklearn.linear_model import LogisticRegression
 
 
 
11
  import torch.nn as nn
 
 
12
  from data.rnn_preprocessing import (
13
- data_preprocessing,
14
- preprocess_single_string
15
- )
16
-
17
- # Load Word2Vec model
18
- wv = Word2Vec.load('models/word2vec32.model')
19
- embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
20
- vocab_to_int = {word: idx + 1 for idx, word in enumerate(wv.wv.index_to_key)}
21
- for word, i in vocab_to_int.items():
22
- try:
23
- embedding_vector = wv.wv[word]
24
- embedding_matrix[i] = embedding_vector
25
- except KeyError:
26
- pass
27
-
28
-
29
-
30
- # Load LSTM model
31
- embedding_layer32 = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
32
- VOCAB_SIZE = len(vocab_to_int) + 1 # add 1 for the padding token
33
- HIDDEN_DIM = 64
34
- SEQ_LEN = 32
35
-
36
-
37
-
38
-
39
- # Load TF-IDF model
40
- tfidf_model = pickle.load(open('models/modeltfidf.sav', 'rb'))
41
-
42
-
43
-
44
- class LSTMClassifierBi32(nn.Module):
45
- def __init__(self, embedding_dim: int, hidden_size: int = 32) -> None:
46
- super().__init__()
47
-
48
- self.embedding_dim = embedding_dim
49
- self.hidden_size = hidden_size
50
- self.embedding = embedding_layer32
51
- self.lstm = nn.LSTM(
52
- input_size=self.embedding_dim,
53
- hidden_size=self.hidden_size,
54
- batch_first=True,
55
- bidirectional=True
56
- )
57
- self.clf = nn.Sequential(
58
- nn.Linear(self.hidden_size * 2, 128),
59
- nn.Dropout(),
60
- nn.Sigmoid(),
61
- nn.Linear(128, 64),
62
- nn.Dropout(),
63
- nn.Sigmoid(),
64
- nn.Linear(64, 1)
65
- )
66
-
67
- def forward(self, x):
68
- embeddings = self.embedding(x)
69
- out, (_, _) = self.lstm(embeddings)
70
- out = self.clf(out[:, -1, :])
71
- return out
72
-
73
-
74
- model = LSTMClassifierBi32(embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_DIM)
75
- model.load_state_dict(torch.load('models/ltsm_bi1.pt'))
76
- model.eval()
77
-
78
- def predict_sentence(text: str, model: nn.Module):
79
- result = model(preprocess_single_string(text, seq_len=SEQ_LEN, vocab_to_int=vocab_to_int).unsqueeze(0)).sigmoid().round().item()
80
- return 'negative' if result == 0.0 else 'positive'
81
-
82
 
83
  def main():
 
84
  df = pd.read_csv('data/imdb.csv')
85
  df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
86
  reviews = df['review'].tolist()
87
  preprocessed = [data_preprocessing(review) for review in reviews]
88
 
89
- tfid_vectorizer = TfidfVectorizer(max_df=0.5, min_df=5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  vect = tfid_vectorizer.fit(preprocessed)
91
  X_tfidf = vect.transform(preprocessed)
92
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  review = st.text_input('Enter review')
94
 
95
  start1 = time.time()
96
-
 
 
 
 
97
  autotoken = transformers.AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
98
 
99
  input_tokens = autotoken(
100
- review,
101
- return_tensors='pt',
102
- padding=True,
103
  max_length=10
104
  )
105
-
106
- config = transformers.AutoConfig.from_pretrained('distilbert-base-uncased', num_labels=2)
107
- automodel = transformers.AutoModelForSequenceClassification.from_config(config)
108
  outputs = automodel(**input_tokens)
109
  st.write('Sentiment Predictions')
110
  st.write(f'\nBERT: {[automodel.config.id2label[i.item()] for i in outputs.logits.argmax(-1)]}')
@@ -112,20 +159,20 @@ def main():
112
  st.write(f'{(end1 - start1):.2f} sec')
113
  start2 = time.time()
114
 
115
- st.write(f'LSTM: {predict_sentence(review, model)}')
116
  end2 = time.time()
117
  st.write(f'{(end2 - start2):.2f} sec')
118
-
 
 
 
119
  start4 = time.time()
120
- st.write(f'TF-IDF+Logistic Regression: {predicttf(review)}')
121
  end4 = time.time()
122
  st.write(f'{(end4 - start4):.2f} sec')
123
 
124
 
125
- def predicttf(text):
126
- result = tfidf_model.predict(vect.transform([text]))
127
- return 'negative' if result == [0] else 'positive'
128
-
129
 
130
  if __name__ == '__main__':
131
- main()
 
1
+ import os
2
+ import numpy as np
3
  import pandas as pd
4
+
5
+ import matplotlib.pyplot as plt
6
  import streamlit as st
7
+ import re
8
+ import string
9
+ from collections import Counter
10
+
11
+ from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
12
+
13
+ from gensim.models import Word2Vec
14
+ from string import punctuation
15
  import transformers
16
+ import warnings
17
+ warnings.filterwarnings('ignore')
18
+
19
+ from sklearn.model_selection import train_test_split
20
  import time
21
+
22
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
 
23
  from sklearn.linear_model import LogisticRegression
24
+ import pickle
25
+ import torch
26
+ from torch.utils.data import DataLoader, TensorDataset
27
  import torch.nn as nn
28
+ import torchutils as tu
29
+ from torchmetrics.classification import BinaryAccuracy
30
  from data.rnn_preprocessing import (
31
+ data_preprocessing,
32
+ preprocess_single_string
33
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  def main():
36
+ device = 'cpu'
37
  df = pd.read_csv('data/imdb.csv')
38
  df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
39
  reviews = df['review'].tolist()
40
  preprocessed = [data_preprocessing(review) for review in reviews]
41
 
42
+ wv = Word2Vec.load('models/word2vec32.model')
43
+
44
+ words_list = [word for review in preprocessed for word in review.lower().split()]
45
+ for i in words_list:
46
+ ''.join([j for j in i if j not in punctuation])
47
+
48
+ # Build the set of unique words.
49
+ unique_words = set(words_list)
50
+
51
+ # word -> index
52
+ vocab_to_int = {word: idx+1 for idx, word in enumerate(sorted(unique_words))}
53
+
54
+ word_seq = [i.split() for i in preprocessed]
55
+ VOCAB_SIZE = len(vocab_to_int) + 1 # add 1 for the padding token
56
+ EMBEDDING_DIM = 32
57
+ HIDDEN_DIM = 64
58
+ SEQ_LEN = 32
59
+
60
+ embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
61
+
62
+ for word, i in vocab_to_int.items():
63
+ try:
64
+ embedding_vector = wv.wv[word]
65
+ embedding_matrix[i] = embedding_vector
66
+ except KeyError:
67
+ pass
68
+
69
+ embedding_layer32 = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
70
+
71
+
72
+ class LSTMClassifierBi32(nn.Module):
73
+ def __init__(self, embedding_dim: int, hidden_size:int = 32) -> None:
74
+ super().__init__()
75
+
76
+ self.embedding_dim = embedding_dim
77
+ self.hidden_size = hidden_size
78
+ self.embedding = embedding_layer32
79
+ self.lstm = nn.LSTM(
80
+ input_size=self.embedding_dim,
81
+ hidden_size=self.hidden_size,
82
+ batch_first=True,
83
+ bidirectional=True
84
+ )
85
+ self.clf = nn.Sequential(nn.Linear(self.hidden_size*2, 128),
86
+ nn.Dropout(),
87
+ nn.Sigmoid(),
88
+ nn.Linear(128, 64),
89
+ nn.Dropout(),
90
+ nn.Sigmoid(),
91
+ nn.Linear(64, 1)
92
+ )
93
+
94
+ def forward(self, x):
95
+ embeddings = self.embedding(x)
96
+ out, (_, _) = self.lstm(embeddings)
97
+ out = self.clf(out[:,-1,:])
98
+ return out
99
+
100
+ model = LSTMClassifierBi32(embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_DIM)
101
+ model.load_state_dict(torch.load('models/ltsm_bi1.pt'))
102
+ model.eval()
103
+
104
+ def predict_sentence(text:str, model: nn.Module):
105
+ result = model.to(device)(preprocess_single_string(text, seq_len=SEQ_LEN, vocab_to_int=vocab_to_int).unsqueeze(0)).sigmoid().round().item()
106
+ return 'negative' if result == 0.0 else 'positive'
107
+
108
+ #Bag Tfidf
109
+ # bagvectorizer = CountVectorizer(max_df=0.5,
110
+ # min_df=5,
111
+ # stop_words="english",)
112
+ # bvect = bagvectorizer.fit(preprocessed)
113
+ # X_bag = bvect.transform(preprocessed)
114
+
115
+ tfid_vectorizer = TfidfVectorizer(
116
+ max_df=0.5,
117
+ min_df=5)
118
  vect = tfid_vectorizer.fit(preprocessed)
119
  X_tfidf = vect.transform(preprocessed)
120
+
121
+ tfidf_model = pickle.load(open('models/modeltfidf.sav', 'rb'))
122
+ # bag_model = pickle.load(open('models/modelbag.sav', 'rb'))
123
+ # def predictbag(text):
124
+ # result = bag_model.predict(vect.transform([text]))
125
+ # return 'negative' if result == [0] else 'positive'
126
+
127
+ def predicttf(text):
128
+ result = tfidf_model.predict(vect.transform([text]))
129
+ return 'negative' if result == [0] else 'positive'
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
  review = st.text_input('Enter review')
140
 
141
  start1 = time.time()
142
+
143
+ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
144
+ config = AutoConfig.from_pretrained('distilbert-base-uncased', num_labels=2)
145
+
146
+ automodel = AutoModelForSequenceClassification.from_config(config)
147
  autotoken = transformers.AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
148
 
149
  input_tokens = autotoken(
150
+ review,
151
+ return_tensors='pt',
152
+ padding=True,
153
  max_length=10
154
  )
 
 
 
155
  outputs = automodel(**input_tokens)
156
  st.write('Sentiment Predictions')
157
  st.write(f'\nBERT: {[automodel.config.id2label[i.item()] for i in outputs.logits.argmax(-1)]}')
 
159
  st.write(f'{(end1 - start1):.2f} sec')
160
  start2 = time.time()
161
 
162
+ st.write(f'LTSM: {predict_sentence(review, model)}')
163
  end2 = time.time()
164
  st.write(f'{(end2 - start2):.2f} sec')
165
+ # start3 = time.time()
166
+ # st.write(f'bag+log: {predictbag(review)}')
167
+ # end3 = time.time()
168
+ # st.write(f'{(end3 - start3):.2f} sec')
169
  start4 = time.time()
170
+ st.write(f'tfidf+log: {predicttf(review)}')
171
  end4 = time.time()
172
  st.write(f'{(end4 - start4):.2f} sec')
173
 
174
 
175
+
 
 
 
176
 
177
  if __name__ == '__main__':
178
+ main()