Teery committed
Commit
5e65e3b
1 parent: c8b1b4a

Update dopset.py

Files changed (1)
  1. dopset.py +81 -80
dopset.py CHANGED
@@ -1,80 +1,81 @@
- import string
- import torch
- import numpy as np
- from nltk.corpus import stopwords
- stop_words = set(stopwords.words('english'))
- import torch.nn as nn
- import pickle
- from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel
- from sklearn.linear_model import LogisticRegression
-
-
- EMBEDDING_DIM = 64
- VOCAB_SIZE = 203310
- embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))  # zero-initialised placeholder weights
- embedding_layer = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
-
-
- with open('lstm/vocab_to_int.txt', 'rb') as f:
-     vocab_to_int = pickle.load(f)
-
-
- class LSTMClassifier(nn.Module):
-     def __init__(self, embedding_dim: int, hidden_size: int = 32) -> None:
-         super().__init__()
-
-         self.embedding_dim = embedding_dim  # embedding size
-         self.hidden_size = hidden_size  # hidden size
-         self.embedding = embedding_layer  # shared pre-built embedding layer
-
-         self.lstm = nn.LSTM(
-             input_size=self.embedding_dim,
-             hidden_size=self.hidden_size,
-             batch_first=True
-         )
-
-         self.clf = nn.Linear(self.hidden_size, 1)  # single-logit binary head
-
-     def forward(self, x):
-         embedding = self.embedding(x)
-         _, (h_n, _) = self.lstm(embedding)
-         out = self.clf(h_n.squeeze())  # classify from the last hidden state
-         return out
-
-
- def data_preprocessing(text: str) -> str:
-     text = text.lower()
-     text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
-     text = [word for word in text.split() if word not in stop_words]  # drop stop words
-     text = ' '.join(text)
-     return text
-
-
- def padding(review_int: list, seq_len: int) -> np.ndarray:
-     features = np.zeros((len(review_int), seq_len), dtype=int)
-     for i, review in enumerate(review_int):
-         if len(review) <= seq_len:
-             new = [0] * (seq_len - len(review)) + review  # left-pad with zeros
-         else:
-             new = review[:seq_len]  # truncate to seq_len
-         features[i, :] = np.array(new)
-     return features
-
-
- def preprocess_single_string(input_string: str, seq_len: int, vocab_to_int: dict = vocab_to_int) -> torch.Tensor:
-     preprocessed_string = data_preprocessing(input_string)
-     result_list = []
-     for word in preprocessed_string.split():
-         try:
-             result_list.append(vocab_to_int[word])
-         except KeyError as e:
-             print(f'{e}: not in dictionary!')
-     result_padded = padding([result_list], seq_len)[0]
-     return torch.tensor(result_padded)
 
+ import string
+ import torch
+ import numpy as np
+ from nltk.corpus import stopwords
+ stop_words = set(stopwords.words('english'))
+ import torch.nn as nn
+ import pickle
+ from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel
+ from sklearn.linear_model import LogisticRegression
+ import nltk
+
+
+ EMBEDDING_DIM = 64
+ VOCAB_SIZE = 203310
+ embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))  # zero-initialised placeholder weights
+ embedding_layer = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
+
+
+ with open('lstm/vocab_to_int.txt', 'rb') as f:
+     vocab_to_int = pickle.load(f)
+
+
+ class LSTMClassifier(nn.Module):
+     def __init__(self, embedding_dim: int, hidden_size: int = 32) -> None:
+         super().__init__()
+
+         self.embedding_dim = embedding_dim  # embedding size
+         self.hidden_size = hidden_size  # hidden size
+         self.embedding = embedding_layer  # shared pre-built embedding layer
+
+         self.lstm = nn.LSTM(
+             input_size=self.embedding_dim,
+             hidden_size=self.hidden_size,
+             batch_first=True
+         )
+
+         self.clf = nn.Linear(self.hidden_size, 1)  # single-logit binary head
+
+     def forward(self, x):
+         embedding = self.embedding(x)
+         _, (h_n, _) = self.lstm(embedding)
+         out = self.clf(h_n.squeeze())  # classify from the last hidden state
+         return out
+
+
+ def data_preprocessing(text: str) -> str:
+     text = text.lower()
+     text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+     text = [word for word in text.split() if word not in stop_words]  # drop stop words
+     text = ' '.join(text)
+     return text
+
+
+ def padding(review_int: list, seq_len: int) -> np.ndarray:
+     features = np.zeros((len(review_int), seq_len), dtype=int)
+     for i, review in enumerate(review_int):
+         if len(review) <= seq_len:
+             new = [0] * (seq_len - len(review)) + review  # left-pad with zeros
+         else:
+             new = review[:seq_len]  # truncate to seq_len
+         features[i, :] = np.array(new)
+     return features
+
+
+ def preprocess_single_string(input_string: str, seq_len: int, vocab_to_int: dict = vocab_to_int) -> torch.Tensor:
+     preprocessed_string = data_preprocessing(input_string)
+     result_list = []
+     for word in preprocessed_string.split():
+         try:
+             result_list.append(vocab_to_int[word])
+         except KeyError as e:
+             print(f'{e}: not in dictionary!')
+     result_padded = padding([result_list], seq_len)[0]
+     return torch.tensor(result_padded)
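
For context, a minimal sketch of how the pieces defined in dopset.py fit together at inference time. Everything beyond the import line is an assumption, not part of this commit: the checkpoint path lstm/model_weights.pt, the SEQ_LEN value, and the 0.5 decision threshold are illustrative. Note also that the added `import nltk` by itself does not fetch the stopwords corpus; a call such as `nltk.download('stopwords')` would still be needed on a fresh machine before `stopwords.words('english')` can succeed at import time.

    # Hypothetical usage sketch (not part of the commit); assumes a trained
    # checkpoint exists at 'lstm/model_weights.pt' and that SEQ_LEN matches
    # the sequence length used during training.
    import torch
    from dopset import LSTMClassifier, preprocess_single_string, EMBEDDING_DIM

    SEQ_LEN = 64  # assumed training-time sequence length

    model = LSTMClassifier(embedding_dim=EMBEDDING_DIM)
    model.load_state_dict(torch.load('lstm/model_weights.pt'))  # hypothetical path
    model.eval()

    ids = preprocess_single_string('This movie was surprisingly good', seq_len=SEQ_LEN)

    with torch.no_grad():
        logit = model(ids.unsqueeze(0))     # add a batch dimension: (1, SEQ_LEN)
        prob = torch.sigmoid(logit).item()  # single logit -> probability

    print('positive' if prob >= 0.5 else 'negative')

The unsqueeze matters because forward's h_n.squeeze() assumes a batched input; with a batch of one the model returns a single logit, which sigmoid maps to a binary class probability.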