Update dopset.py
dopset.py
CHANGED
@@ -1,80 +1,81 @@
 import string
 import torch
 import numpy as np
 from nltk.corpus import stopwords
 stop_words = set(stopwords.words('english'))
 import torch.nn as nn
 import pickle
 from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel
 from sklearn.linear_model import LogisticRegression
-        self.
-        self.
-    text =
-    text = [
-    text =
+import nltk
+
+
+EMBEDDING_DIM = 64
+VOCAB_SIZE = 203310
+embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
+embedding_layer = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
+
+
+# Vocabulary mapping (word -> integer index) saved at training time.
+with open('lstm/vocab_to_int.txt', 'rb') as f:
+    vocab_to_int = pickle.load(f)
+
+
+class LSTMClassifier(nn.Module):
+    def __init__(self, embedding_dim: int, hidden_size: int = 32) -> None:
+        super().__init__()
+
+        self.embedding_dim = embedding_dim  # set the embedding size
+        self.hidden_size = hidden_size  # set the hidden size
+        self.embedding = embedding_layer  # create the model layers
+
+        self.lstm = nn.LSTM(
+            input_size=self.embedding_dim,
+            hidden_size=self.hidden_size,
+            batch_first=True
+        )
+
+        self.clf = nn.Linear(self.hidden_size, 1)
+
+    def forward(self, x):
+        embedding = self.embedding(x)
+        _, (h_n, _) = self.lstm(embedding)  # h_n: final hidden state
+        out = self.clf(h_n.squeeze())
+        return out
+
+
+def data_preprocessing(text: str) -> str:
+    text = text.lower()
+    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+    text = [word for word in text.split() if word not in stop_words]  # drop stop words
+    text = ' '.join(text)
+    return text
+
+
+def padding(review_int: list, seq_len: int) -> np.ndarray:
+    features = np.zeros((len(review_int), seq_len), dtype=int)
+    for i, review in enumerate(review_int):
+        if len(review) <= seq_len:
+            # left-pad short sequences with zeros
+            zeros = list(np.zeros(seq_len - len(review)))
+            new = zeros + review
+        else:
+            # truncate long sequences
+            new = review[:seq_len]
+        features[i, :] = np.array(new)
+
+    return features
+
+
+def preprocess_single_string(input_string: str, seq_len: int, vocab_to_int: dict = vocab_to_int) -> torch.Tensor:
+    preprocessed_string = data_preprocessing(input_string)
+    result_list = []
+    for word in preprocessed_string.split():
+        try:
+            result_list.append(vocab_to_int[word])
+        except KeyError as e:
+            print(f'{e}: not in dictionary!')
+    result_padded = padding([result_list], seq_len)[0]
+
+    return torch.tensor(result_padded)
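For context, the helpers above form a single inference path: data_preprocessing cleans a raw string, vocab_to_int maps words to indices, padding left-pads to a fixed length, and LSTMClassifier scores the result. Below is a minimal usage sketch; the checkpoint path 'lstm/model_weights.pt' and the sequence length of 32 are illustrative assumptions, not part of this commit.

import torch

# Hypothetical inference sketch -- the checkpoint path and seq_len are assumptions.
model = LSTMClassifier(embedding_dim=EMBEDDING_DIM)
model.load_state_dict(torch.load('lstm/model_weights.pt'))  # hypothetical checkpoint
model.eval()

sample = preprocess_single_string('This movie was great', seq_len=32)
with torch.no_grad():
    logit = model(sample.unsqueeze(0))   # add a batch dimension: shape (1, 32)
    prob = torch.sigmoid(logit).item()   # probability of the positive class
print(f'positive probability: {prob:.3f}')

Note that h_n.squeeze() drops the batch dimension for a single example, so the output here is a one-element tensor; batched inputs of shape (N, seq_len) come out as (N, 1).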