Anton committed on
Commit
aade6d7
·
1 Parent(s): 754fbbf

Add application file

Browse files
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
file/lstm_preprocessing.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ from transformers import BertTokenizer, BertModel
7
+ from sklearn.linear_model import LogisticRegression
8
+ from nltk.stem import SnowballStemmer
9
+
10
+ from nltk.corpus import stopwords
11
# English stop words as a set; consulted by data_preprocessing.
stop_words = set(stopwords.words('english'))

# Russian stemmer and Russian stop-word list; both consulted by tokin.
stemmer = SnowballStemmer('russian')
sw = stopwords.words('russian')

# Tokenizer of the multilingual cased BERT checkpoint; consulted by
# predict_single_string. Loaded once at import time.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
16
+
17
class LSTMClassifier(nn.Module):
    """Binary text classifier: embedding -> single-layer LSTM -> linear head.

    Args:
        embedding_dim: size of the vectors produced by ``embedding``; used as
            the LSTM input size.
        hidden_size: size of the LSTM hidden state.
        embedding: embedding layer applied to the input token indices.
    """

    def __init__(self, embedding_dim: int, hidden_size: int, embedding: torch.nn.modules.sparse.Embedding) -> None:
        super().__init__()

        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.embedding = embedding

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            batch_first=True,
        )
        self.clf = nn.Linear(self.hidden_size, 1)

    def forward(self, x):
        """Return one raw score per input sequence.

        Args:
            x: token indices, batch first (batch, seq_len).

        Returns:
            Tensor of shape (batch, 1); no sigmoid is applied here.
        """
        embeddings = self.embedding(x)
        _, (h_n, _) = self.lstm(embeddings)
        # h_n is (1, batch, hidden) for this single-layer, one-direction LSTM.
        # Drop only that leading axis: a bare squeeze() would also remove the
        # batch axis when batch == 1 and the hidden axis when hidden_size == 1.
        out = self.clf(h_n.squeeze(0))
        return out
37
+
38
+
39
def data_preprocessing(text: str) -> str:
    """Normalise a raw string before vocabulary lookup.

    Steps, in order: lowercase, remove HTML-like tags, delete ASCII
    punctuation characters, drop words found in the module-level
    ``stop_words`` and drop words made only of digits.

    Args:
        text (str): input string for preprocessing

    Returns:
        str: the remaining words joined by single spaces
    """
    lowered = text.lower()
    without_tags = re.sub('<.*?>', '', lowered)  # html tags
    # Punctuation is deleted, not replaced by a space, so "a,b" becomes "ab".
    no_punct = without_tags.translate(str.maketrans('', '', string.punctuation))
    kept = []
    for token in no_punct.split():
        if token in stop_words or token.isdigit():
            continue
        kept.append(token)
    return ' '.join(kept)
57
+
58
def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    """Keep the entries whose second element is strictly greater than ``n``.

    Args:
        sorted_words (list): indexable items; ``item[1]`` is compared with
            ``n`` (presumably (word, count) pairs -- confirm against caller)
        n (int): exclusive lower bound. Defaults to 10.

    Returns:
        list: the matching items in their original order
    """
    return [entry for entry in sorted_words if entry[1] > n]
60
+
61
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad or truncate token sequences to a fixed length.

    Args:
        review_int (list): sequences of integer tokens (lists, tuples or
            1-D arrays)
        seq_len (int): target length; a longer sequence keeps its first
            ``seq_len`` tokens, a shorter one is padded with zeros on the left

    Returns:
        np.ndarray: integer array of shape (len(review_int), seq_len)
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        kept = review[:seq_len]
        if len(kept):
            # Write straight into the tail of the zero-filled row. This avoids
            # building a float list of zeros (which rounds very large ints)
            # and works for any sliceable sequence, not only lists.
            features[i, seq_len - len(kept):] = kept
    return features
81
+
82
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
) -> torch.Tensor:
    """Turn one raw string into a fixed-length tensor of vocabulary indices.

    The string is cleaned with ``data_preprocessing``, each remaining word is
    looked up in ``vocab_to_int`` (unknown words are reported on stdout and
    skipped) and the index list is passed through ``padding``.

    Args:
        input_string (str): input single string for preprocessing
        seq_len (int): length of the returned tensor
        vocab_to_int (dict): word corpus {'word': int index}

    Returns:
        torch.Tensor: 1-D tensor with ``seq_len`` entries
    """
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            # Out-of-vocabulary words are dropped, not mapped to a placeholder.
            print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)
108
+
109
def predict_sentence(text: str, model: nn.Module, seq_len: int, vocab_to_int: dict) -> str:
    """Classify one review with the LSTM model.

    Args:
        text (str): raw review text
        model (nn.Module): model called on a (1, seq_len) index tensor; it
            must return a single value. It is switched to eval mode.
        seq_len (int): sequence length passed to ``preprocess_single_string``
        vocab_to_int (dict): word corpus {'word': int index}

    Returns:
        str: 'Негативный отзыв' when the rounded sigmoid of the model output
        is 0, otherwise 'Позитивный отзыв'
    """
    p_str = preprocess_single_string(text, seq_len, vocab_to_int).unsqueeze(0)
    model.eval()
    # Inference only: skip autograd bookkeeping, as predict_single_string does.
    with torch.no_grad():
        pred = model(p_str)
    output = pred.sigmoid().round().item()
    if output == 0:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'
118
+
119
def predict_single_string(text: str,
                          model: BertModel,
                          loaded_model: LogisticRegression
                          ) -> str:
    """Classify one review with BERT features and a fitted classifier.

    The text is tokenized with the module-level ``tokenizer`` and run through
    ``model``. The vector at position 0 of the first model output (presumably
    the [CLS] embedding -- confirm) is given to ``loaded_model.predict_proba``.

    Args:
        text (str): raw review text
        model (BertModel): encoder producing the feature vector
        loaded_model (LogisticRegression): fitted classifier with two classes

    Returns:
        str: 'Негативный отзыв' when the first class is strictly more
        probable than the second, otherwise 'Позитивный отзыв'
    """
    with torch.no_grad():
        # truncation=True clips over-long texts to the tokenizer's configured
        # maximum length instead of overflowing the model's position limit.
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
        output = model(**encoded_input)
        vector = output[0][:, 0, :]
        # One predict_proba call serves both class probabilities.
        probabilities = loaded_model.predict_proba(vector)[0]
    pred0, pred1 = probabilities[0], probabilities[1]
    if pred0 > pred1:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'
134
+
135
def clean(text):
    """Lowercase ``text`` and remove digits and ASCII punctuation.

    Args:
        text (str): raw input string

    Returns:
        str: cleaned text in which every whitespace run (newlines included)
        is a single space; a leading or trailing space is not stripped
    """
    text = text.lower()
    text = re.sub(r'\d+', ' ', text)  # replace each run of digits with a space
    text = text.translate(str.maketrans('', '', string.punctuation))  # delete punctuation
    # Collapse whitespace last: the two steps above leave runs of spaces, and
    # \s+ already covers newlines, so no separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text)

    return text
144
+
145
def tokin(text):
    """Clean, stem and stop-word-filter a string.

    The text goes through ``clean``, every word is stemmed with the
    module-level ``stemmer`` and stems found in ``sw`` are dropped.

    NOTE(review): stop words are matched against the stems, not the original
    words; the order is kept as is because fitted models may depend on it.

    Args:
        text (str): raw input string

    Returns:
        str: the remaining stems joined by single spaces
    """
    stemmed = ' '.join(map(stemmer.stem, clean(text).split()))
    kept = filter(lambda stem: stem not in sw, stemmed.split())
    return ' '.join(kept)
150
+
151
+
152
def predict_ml_class(text, loaded_vectorizer, loaded_classifier):
    """Classify one review with a fitted vectorizer and classifier.

    Args:
        text (str): raw review text
        loaded_vectorizer: fitted vectorizer; assumed to follow the
            scikit-learn convention that ``transform`` takes an iterable of
            documents -- confirm against the object that is loaded
        loaded_classifier: fitted classifier with a ``predict`` method

    Returns:
        str: 'Негативный отзыв' when the predicted label is 0, otherwise
        'Позитивный отзыв'
    """
    # The whole review is ONE document. Passing the list of its words would
    # yield one prediction per word and make the comparison below ambiguous.
    new_text_bow = loaded_vectorizer.transform([tokin(text)])
    predicted_label = loaded_classifier.predict(new_text_bow)[0]
    if predicted_label == 0:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'
function/lstm_preprocessing.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ from transformers import BertTokenizer, BertModel
7
+ from sklearn.linear_model import LogisticRegression
8
+ from nltk.stem import SnowballStemmer
9
+
10
+ from nltk.corpus import stopwords
11
# English stop words as a set; consulted by data_preprocessing.
stop_words = set(stopwords.words('english'))

# Russian stemmer and Russian stop-word list; both consulted by tokin.
stemmer = SnowballStemmer('russian')
sw = stopwords.words('russian')

# Tokenizer of the multilingual cased BERT checkpoint; consulted by
# predict_single_string. Loaded once at import time.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
16
+
17
class LSTMClassifier(nn.Module):
    """Binary text classifier: embedding -> single-layer LSTM -> linear head.

    Args:
        embedding_dim: size of the vectors produced by ``embedding``; used as
            the LSTM input size.
        hidden_size: size of the LSTM hidden state.
        embedding: embedding layer applied to the input token indices.
    """

    def __init__(self, embedding_dim: int, hidden_size: int, embedding: torch.nn.modules.sparse.Embedding) -> None:
        super().__init__()

        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.embedding = embedding

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            batch_first=True,
        )
        self.clf = nn.Linear(self.hidden_size, 1)

    def forward(self, x):
        """Return one raw score per input sequence.

        Args:
            x: token indices, batch first (batch, seq_len).

        Returns:
            Tensor of shape (batch, 1); no sigmoid is applied here.
        """
        embeddings = self.embedding(x)
        _, (h_n, _) = self.lstm(embeddings)
        # h_n is (1, batch, hidden) for this single-layer, one-direction LSTM.
        # Drop only that leading axis: a bare squeeze() would also remove the
        # batch axis when batch == 1 and the hidden axis when hidden_size == 1.
        out = self.clf(h_n.squeeze(0))
        return out
37
+
38
+
39
def data_preprocessing(text: str) -> str:
    """Normalise a raw string before vocabulary lookup.

    Steps, in order: lowercase, remove HTML-like tags, delete ASCII
    punctuation characters, drop words found in the module-level
    ``stop_words`` and drop words made only of digits.

    Args:
        text (str): input string for preprocessing

    Returns:
        str: the remaining words joined by single spaces
    """
    lowered = text.lower()
    without_tags = re.sub('<.*?>', '', lowered)  # html tags
    # Punctuation is deleted, not replaced by a space, so "a,b" becomes "ab".
    no_punct = without_tags.translate(str.maketrans('', '', string.punctuation))
    kept = []
    for token in no_punct.split():
        if token in stop_words or token.isdigit():
            continue
        kept.append(token)
    return ' '.join(kept)
57
+
58
def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    """Keep the entries whose second element is strictly greater than ``n``.

    Args:
        sorted_words (list): indexable items; ``item[1]`` is compared with
            ``n`` (presumably (word, count) pairs -- confirm against caller)
        n (int): exclusive lower bound. Defaults to 10.

    Returns:
        list: the matching items in their original order
    """
    return [entry for entry in sorted_words if entry[1] > n]
60
+
61
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad or truncate token sequences to a fixed length.

    Args:
        review_int (list): sequences of integer tokens (lists, tuples or
            1-D arrays)
        seq_len (int): target length; a longer sequence keeps its first
            ``seq_len`` tokens, a shorter one is padded with zeros on the left

    Returns:
        np.ndarray: integer array of shape (len(review_int), seq_len)
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        kept = review[:seq_len]
        if len(kept):
            # Write straight into the tail of the zero-filled row. This avoids
            # building a float list of zeros (which rounds very large ints)
            # and works for any sliceable sequence, not only lists.
            features[i, seq_len - len(kept):] = kept
    return features
81
+
82
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
) -> torch.Tensor:
    """Turn one raw string into a fixed-length tensor of vocabulary indices.

    The string is cleaned with ``data_preprocessing``, each remaining word is
    looked up in ``vocab_to_int`` (unknown words are reported on stdout and
    skipped) and the index list is passed through ``padding``.

    Args:
        input_string (str): input single string for preprocessing
        seq_len (int): length of the returned tensor
        vocab_to_int (dict): word corpus {'word': int index}

    Returns:
        torch.Tensor: 1-D tensor with ``seq_len`` entries
    """
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            # Out-of-vocabulary words are dropped, not mapped to a placeholder.
            print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)
108
+
109
def predict_sentence(text: str, model: nn.Module, seq_len: int, vocab_to_int: dict) -> str:
    """Classify one review with the LSTM model.

    Args:
        text (str): raw review text
        model (nn.Module): model called on a (1, seq_len) index tensor; it
            must return a single value. It is switched to eval mode.
        seq_len (int): sequence length passed to ``preprocess_single_string``
        vocab_to_int (dict): word corpus {'word': int index}

    Returns:
        str: 'Негативный отзыв' when the rounded sigmoid of the model output
        is 0, otherwise 'Позитивный отзыв'
    """
    p_str = preprocess_single_string(text, seq_len, vocab_to_int).unsqueeze(0)
    model.eval()
    # Inference only: skip autograd bookkeeping, as predict_single_string does.
    with torch.no_grad():
        pred = model(p_str)
    output = pred.sigmoid().round().item()
    if output == 0:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'
118
+
119
def predict_single_string(text: str,
                          model: BertModel,
                          loaded_model: LogisticRegression
                          ) -> str:
    """Classify one review with BERT features and a fitted classifier.

    The text is tokenized with the module-level ``tokenizer`` and run through
    ``model``. The vector at position 0 of the first model output (presumably
    the [CLS] embedding -- confirm) is given to ``loaded_model.predict_proba``.

    Args:
        text (str): raw review text
        model (BertModel): encoder producing the feature vector
        loaded_model (LogisticRegression): fitted classifier with two classes

    Returns:
        str: 'Негативный отзыв' when the first class is strictly more
        probable than the second, otherwise 'Позитивный отзыв'
    """
    with torch.no_grad():
        # truncation=True clips over-long texts to the tokenizer's configured
        # maximum length instead of overflowing the model's position limit.
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
        output = model(**encoded_input)
        vector = output[0][:, 0, :]
        # One predict_proba call serves both class probabilities.
        probabilities = loaded_model.predict_proba(vector)[0]
    pred0, pred1 = probabilities[0], probabilities[1]
    if pred0 > pred1:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'
134
+
135
def clean(text):
    """Lowercase ``text`` and remove digits and ASCII punctuation.

    Args:
        text (str): raw input string

    Returns:
        str: cleaned text in which every whitespace run (newlines included)
        is a single space; a leading or trailing space is not stripped
    """
    text = text.lower()
    text = re.sub(r'\d+', ' ', text)  # replace each run of digits with a space
    text = text.translate(str.maketrans('', '', string.punctuation))  # delete punctuation
    # Collapse whitespace last: the two steps above leave runs of spaces, and
    # \s+ already covers newlines, so no separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text)

    return text
144
+
145
def tokin(text):
    """Clean, stem and stop-word-filter a string.

    The text goes through ``clean``, every word is stemmed with the
    module-level ``stemmer`` and stems found in ``sw`` are dropped.

    NOTE(review): stop words are matched against the stems, not the original
    words; the order is kept as is because fitted models may depend on it.

    Args:
        text (str): raw input string

    Returns:
        str: the remaining stems joined by single spaces
    """
    stemmed = ' '.join(map(stemmer.stem, clean(text).split()))
    kept = filter(lambda stem: stem not in sw, stemmed.split())
    return ' '.join(kept)
150
+
151
+
152
def predict_ml_class(text, loaded_vectorizer, loaded_classifier):
    """Classify one review with a fitted vectorizer and classifier.

    Args:
        text (str): raw review text
        loaded_vectorizer: fitted vectorizer; assumed to follow the
            scikit-learn convention that ``transform`` takes an iterable of
            documents -- confirm against the object that is loaded
        loaded_classifier: fitted classifier with a ``predict`` method

    Returns:
        str: 'Негативный отзыв' when the predicted label is 0, otherwise
        'Позитивный отзыв'
    """
    # The whole review is ONE document. Passing the list of its words would
    # yield one prediction per word and make the comparison below ambiguous.
    new_text_bow = loaded_vectorizer.transform([tokin(text)])
    predicted_label = loaded_classifier.predict(new_text_bow)[0]
    if predicted_label == 0:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'
images/lstm_preprocessing.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ from transformers import BertTokenizer, BertModel
7
+ from sklearn.linear_model import LogisticRegression
8
+ from nltk.stem import SnowballStemmer
9
+
10
+ from nltk.corpus import stopwords
11
# English stop words as a set; consulted by data_preprocessing.
stop_words = set(stopwords.words('english'))

# Russian stemmer and Russian stop-word list; both consulted by tokin.
stemmer = SnowballStemmer('russian')
sw = stopwords.words('russian')

# Tokenizer of the multilingual cased BERT checkpoint; consulted by
# predict_single_string. Loaded once at import time.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
16
+
17
class LSTMClassifier(nn.Module):
    """Binary text classifier: embedding -> single-layer LSTM -> linear head.

    Args:
        embedding_dim: size of the vectors produced by ``embedding``; used as
            the LSTM input size.
        hidden_size: size of the LSTM hidden state.
        embedding: embedding layer applied to the input token indices.
    """

    def __init__(self, embedding_dim: int, hidden_size: int, embedding: torch.nn.modules.sparse.Embedding) -> None:
        super().__init__()

        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.embedding = embedding

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            batch_first=True,
        )
        self.clf = nn.Linear(self.hidden_size, 1)

    def forward(self, x):
        """Return one raw score per input sequence.

        Args:
            x: token indices, batch first (batch, seq_len).

        Returns:
            Tensor of shape (batch, 1); no sigmoid is applied here.
        """
        embeddings = self.embedding(x)
        _, (h_n, _) = self.lstm(embeddings)
        # h_n is (1, batch, hidden) for this single-layer, one-direction LSTM.
        # Drop only that leading axis: a bare squeeze() would also remove the
        # batch axis when batch == 1 and the hidden axis when hidden_size == 1.
        out = self.clf(h_n.squeeze(0))
        return out
37
+
38
+
39
def data_preprocessing(text: str) -> str:
    """Normalise a raw string before vocabulary lookup.

    Steps, in order: lowercase, remove HTML-like tags, delete ASCII
    punctuation characters, drop words found in the module-level
    ``stop_words`` and drop words made only of digits.

    Args:
        text (str): input string for preprocessing

    Returns:
        str: the remaining words joined by single spaces
    """
    lowered = text.lower()
    without_tags = re.sub('<.*?>', '', lowered)  # html tags
    # Punctuation is deleted, not replaced by a space, so "a,b" becomes "ab".
    no_punct = without_tags.translate(str.maketrans('', '', string.punctuation))
    kept = []
    for token in no_punct.split():
        if token in stop_words or token.isdigit():
            continue
        kept.append(token)
    return ' '.join(kept)
57
+
58
def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    """Keep the entries whose second element is strictly greater than ``n``.

    Args:
        sorted_words (list): indexable items; ``item[1]`` is compared with
            ``n`` (presumably (word, count) pairs -- confirm against caller)
        n (int): exclusive lower bound. Defaults to 10.

    Returns:
        list: the matching items in their original order
    """
    return [entry for entry in sorted_words if entry[1] > n]
60
+
61
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad or truncate token sequences to a fixed length.

    Args:
        review_int (list): sequences of integer tokens (lists, tuples or
            1-D arrays)
        seq_len (int): target length; a longer sequence keeps its first
            ``seq_len`` tokens, a shorter one is padded with zeros on the left

    Returns:
        np.ndarray: integer array of shape (len(review_int), seq_len)
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        kept = review[:seq_len]
        if len(kept):
            # Write straight into the tail of the zero-filled row. This avoids
            # building a float list of zeros (which rounds very large ints)
            # and works for any sliceable sequence, not only lists.
            features[i, seq_len - len(kept):] = kept
    return features
81
+
82
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
) -> torch.Tensor:
    """Turn one raw string into a fixed-length tensor of vocabulary indices.

    The string is cleaned with ``data_preprocessing``, each remaining word is
    looked up in ``vocab_to_int`` (unknown words are reported on stdout and
    skipped) and the index list is passed through ``padding``.

    Args:
        input_string (str): input single string for preprocessing
        seq_len (int): length of the returned tensor
        vocab_to_int (dict): word corpus {'word': int index}

    Returns:
        torch.Tensor: 1-D tensor with ``seq_len`` entries
    """
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            # Out-of-vocabulary words are dropped, not mapped to a placeholder.
            print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)
108
+
109
def predict_sentence(text: str, model: nn.Module, seq_len: int, vocab_to_int: dict) -> str:
    """Classify one review with the LSTM model.

    Args:
        text (str): raw review text
        model (nn.Module): model called on a (1, seq_len) index tensor; it
            must return a single value. It is switched to eval mode.
        seq_len (int): sequence length passed to ``preprocess_single_string``
        vocab_to_int (dict): word corpus {'word': int index}

    Returns:
        str: 'Негативный отзыв' when the rounded sigmoid of the model output
        is 0, otherwise 'Позитивный отзыв'
    """
    p_str = preprocess_single_string(text, seq_len, vocab_to_int).unsqueeze(0)
    model.eval()
    # Inference only: skip autograd bookkeeping, as predict_single_string does.
    with torch.no_grad():
        pred = model(p_str)
    output = pred.sigmoid().round().item()
    if output == 0:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'
118
+
119
def predict_single_string(text: str,
                          model: BertModel,
                          loaded_model: LogisticRegression
                          ) -> str:
    """Classify one review with BERT features and a fitted classifier.

    The text is tokenized with the module-level ``tokenizer`` and run through
    ``model``. The vector at position 0 of the first model output (presumably
    the [CLS] embedding -- confirm) is given to ``loaded_model.predict_proba``.

    Args:
        text (str): raw review text
        model (BertModel): encoder producing the feature vector
        loaded_model (LogisticRegression): fitted classifier with two classes

    Returns:
        str: 'Негативный отзыв' when the first class is strictly more
        probable than the second, otherwise 'Позитивный отзыв'
    """
    with torch.no_grad():
        # truncation=True clips over-long texts to the tokenizer's configured
        # maximum length instead of overflowing the model's position limit.
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
        output = model(**encoded_input)
        vector = output[0][:, 0, :]
        # One predict_proba call serves both class probabilities.
        probabilities = loaded_model.predict_proba(vector)[0]
    pred0, pred1 = probabilities[0], probabilities[1]
    if pred0 > pred1:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'
134
+
135
def clean(text):
    """Lowercase ``text`` and remove digits and ASCII punctuation.

    Args:
        text (str): raw input string

    Returns:
        str: cleaned text in which every whitespace run (newlines included)
        is a single space; a leading or trailing space is not stripped
    """
    text = text.lower()
    text = re.sub(r'\d+', ' ', text)  # replace each run of digits with a space
    text = text.translate(str.maketrans('', '', string.punctuation))  # delete punctuation
    # Collapse whitespace last: the two steps above leave runs of spaces, and
    # \s+ already covers newlines, so no separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text)

    return text
144
+
145
def tokin(text):
    """Clean, stem and stop-word-filter a string.

    The text goes through ``clean``, every word is stemmed with the
    module-level ``stemmer`` and stems found in ``sw`` are dropped.

    NOTE(review): stop words are matched against the stems, not the original
    words; the order is kept as is because fitted models may depend on it.

    Args:
        text (str): raw input string

    Returns:
        str: the remaining stems joined by single spaces
    """
    stemmed = ' '.join(map(stemmer.stem, clean(text).split()))
    kept = filter(lambda stem: stem not in sw, stemmed.split())
    return ' '.join(kept)
150
+
151
+
152
def predict_ml_class(text, loaded_vectorizer, loaded_classifier):
    """Classify one review with a fitted vectorizer and classifier.

    Args:
        text (str): raw review text
        loaded_vectorizer: fitted vectorizer; assumed to follow the
            scikit-learn convention that ``transform`` takes an iterable of
            documents -- confirm against the object that is loaded
        loaded_classifier: fitted classifier with a ``predict`` method

    Returns:
        str: 'Негативный отзыв' when the predicted label is 0, otherwise
        'Позитивный отзыв'
    """
    # The whole review is ONE document. Passing the list of its words would
    # yield one prediction per word and make the comparison below ambiguous.
    new_text_bow = loaded_vectorizer.transform([tokin(text)])
    predicted_label = loaded_classifier.predict(new_text_bow)[0]
    if predicted_label == 0:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'
models/lstm_preprocessing.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ from transformers import BertTokenizer, BertModel
7
+ from sklearn.linear_model import LogisticRegression
8
+ from nltk.stem import SnowballStemmer
9
+
10
+ from nltk.corpus import stopwords
11
# English stop words as a set; consulted by data_preprocessing.
stop_words = set(stopwords.words('english'))

# Russian stemmer and Russian stop-word list; both consulted by tokin.
stemmer = SnowballStemmer('russian')
sw = stopwords.words('russian')

# Tokenizer of the multilingual cased BERT checkpoint; consulted by
# predict_single_string. Loaded once at import time.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
16
+
17
class LSTMClassifier(nn.Module):
    """Binary text classifier: embedding -> single-layer LSTM -> linear head.

    Args:
        embedding_dim: size of the vectors produced by ``embedding``; used as
            the LSTM input size.
        hidden_size: size of the LSTM hidden state.
        embedding: embedding layer applied to the input token indices.
    """

    def __init__(self, embedding_dim: int, hidden_size: int, embedding: torch.nn.modules.sparse.Embedding) -> None:
        super().__init__()

        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.embedding = embedding

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            batch_first=True,
        )
        self.clf = nn.Linear(self.hidden_size, 1)

    def forward(self, x):
        """Return one raw score per input sequence.

        Args:
            x: token indices, batch first (batch, seq_len).

        Returns:
            Tensor of shape (batch, 1); no sigmoid is applied here.
        """
        embeddings = self.embedding(x)
        _, (h_n, _) = self.lstm(embeddings)
        # h_n is (1, batch, hidden) for this single-layer, one-direction LSTM.
        # Drop only that leading axis: a bare squeeze() would also remove the
        # batch axis when batch == 1 and the hidden axis when hidden_size == 1.
        out = self.clf(h_n.squeeze(0))
        return out
37
+
38
+
39
def data_preprocessing(text: str) -> str:
    """Normalise a raw string before vocabulary lookup.

    Steps, in order: lowercase, remove HTML-like tags, delete ASCII
    punctuation characters, drop words found in the module-level
    ``stop_words`` and drop words made only of digits.

    Args:
        text (str): input string for preprocessing

    Returns:
        str: the remaining words joined by single spaces
    """
    lowered = text.lower()
    without_tags = re.sub('<.*?>', '', lowered)  # html tags
    # Punctuation is deleted, not replaced by a space, so "a,b" becomes "ab".
    no_punct = without_tags.translate(str.maketrans('', '', string.punctuation))
    kept = []
    for token in no_punct.split():
        if token in stop_words or token.isdigit():
            continue
        kept.append(token)
    return ' '.join(kept)
57
+
58
def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    """Keep the entries whose second element is strictly greater than ``n``.

    Args:
        sorted_words (list): indexable items; ``item[1]`` is compared with
            ``n`` (presumably (word, count) pairs -- confirm against caller)
        n (int): exclusive lower bound. Defaults to 10.

    Returns:
        list: the matching items in their original order
    """
    return [entry for entry in sorted_words if entry[1] > n]
60
+
61
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad or truncate token sequences to a fixed length.

    Args:
        review_int (list): sequences of integer tokens (lists, tuples or
            1-D arrays)
        seq_len (int): target length; a longer sequence keeps its first
            ``seq_len`` tokens, a shorter one is padded with zeros on the left

    Returns:
        np.ndarray: integer array of shape (len(review_int), seq_len)
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        kept = review[:seq_len]
        if len(kept):
            # Write straight into the tail of the zero-filled row. This avoids
            # building a float list of zeros (which rounds very large ints)
            # and works for any sliceable sequence, not only lists.
            features[i, seq_len - len(kept):] = kept
    return features
81
+
82
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
) -> torch.Tensor:
    """Turn one raw string into a fixed-length tensor of vocabulary indices.

    The string is cleaned with ``data_preprocessing``, each remaining word is
    looked up in ``vocab_to_int`` (unknown words are reported on stdout and
    skipped) and the index list is passed through ``padding``.

    Args:
        input_string (str): input single string for preprocessing
        seq_len (int): length of the returned tensor
        vocab_to_int (dict): word corpus {'word': int index}

    Returns:
        torch.Tensor: 1-D tensor with ``seq_len`` entries
    """
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            # Out-of-vocabulary words are dropped, not mapped to a placeholder.
            print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)
108
+
109
def predict_sentence(text: str, model: nn.Module, seq_len: int, vocab_to_int: dict) -> str:
    """Classify one review with the LSTM model.

    Args:
        text (str): raw review text
        model (nn.Module): model called on a (1, seq_len) index tensor; it
            must return a single value. It is switched to eval mode.
        seq_len (int): sequence length passed to ``preprocess_single_string``
        vocab_to_int (dict): word corpus {'word': int index}

    Returns:
        str: 'Негативный отзыв' when the rounded sigmoid of the model output
        is 0, otherwise 'Позитивный отзыв'
    """
    p_str = preprocess_single_string(text, seq_len, vocab_to_int).unsqueeze(0)
    model.eval()
    # Inference only: skip autograd bookkeeping, as predict_single_string does.
    with torch.no_grad():
        pred = model(p_str)
    output = pred.sigmoid().round().item()
    if output == 0:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'
118
+
119
def predict_single_string(text: str,
                          model: BertModel,
                          loaded_model: LogisticRegression
                          ) -> str:
    """Classify one review with BERT features and a fitted classifier.

    The text is tokenized with the module-level ``tokenizer`` and run through
    ``model``. The vector at position 0 of the first model output (presumably
    the [CLS] embedding -- confirm) is given to ``loaded_model.predict_proba``.

    Args:
        text (str): raw review text
        model (BertModel): encoder producing the feature vector
        loaded_model (LogisticRegression): fitted classifier with two classes

    Returns:
        str: 'Негативный отзыв' when the first class is strictly more
        probable than the second, otherwise 'Позитивный отзыв'
    """
    with torch.no_grad():
        # truncation=True clips over-long texts to the tokenizer's configured
        # maximum length instead of overflowing the model's position limit.
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
        output = model(**encoded_input)
        vector = output[0][:, 0, :]
        # One predict_proba call serves both class probabilities.
        probabilities = loaded_model.predict_proba(vector)[0]
    pred0, pred1 = probabilities[0], probabilities[1]
    if pred0 > pred1:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'
134
+
135
def clean(text):
    """Lowercase ``text`` and remove digits and ASCII punctuation.

    Args:
        text (str): raw input string

    Returns:
        str: cleaned text in which every whitespace run (newlines included)
        is a single space; a leading or trailing space is not stripped
    """
    text = text.lower()
    text = re.sub(r'\d+', ' ', text)  # replace each run of digits with a space
    text = text.translate(str.maketrans('', '', string.punctuation))  # delete punctuation
    # Collapse whitespace last: the two steps above leave runs of spaces, and
    # \s+ already covers newlines, so no separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text)

    return text
144
+
145
def tokin(text):
    """Clean, stem and stop-word-filter a string.

    The text goes through ``clean``, every word is stemmed with the
    module-level ``stemmer`` and stems found in ``sw`` are dropped.

    NOTE(review): stop words are matched against the stems, not the original
    words; the order is kept as is because fitted models may depend on it.

    Args:
        text (str): raw input string

    Returns:
        str: the remaining stems joined by single spaces
    """
    stemmed = ' '.join(map(stemmer.stem, clean(text).split()))
    kept = filter(lambda stem: stem not in sw, stemmed.split())
    return ' '.join(kept)
150
+
151
+
152
def predict_ml_class(text, loaded_vectorizer, loaded_classifier):
    """Classify one review with a fitted vectorizer and classifier.

    Args:
        text (str): raw review text
        loaded_vectorizer: fitted vectorizer; assumed to follow the
            scikit-learn convention that ``transform`` takes an iterable of
            documents -- confirm against the object that is loaded
        loaded_classifier: fitted classifier with a ``predict`` method

    Returns:
        str: 'Негативный отзыв' when the predicted label is 0, otherwise
        'Позитивный отзыв'
    """
    # The whole review is ONE document. Passing the list of its words would
    # yield one prediction per word and make the comparison below ambiguous.
    new_text_bow = loaded_vectorizer.transform([tokin(text)])
    predicted_label = loaded_classifier.predict(new_text_bow)[0]
    if predicted_label == 0:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'
pages/lstm_preprocessing.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ from transformers import BertTokenizer, BertModel
7
+ from sklearn.linear_model import LogisticRegression
8
+ from nltk.stem import SnowballStemmer
9
+
10
+ from nltk.corpus import stopwords
11
# English stop words as a set; consulted by data_preprocessing.
stop_words = set(stopwords.words('english'))

# Russian stemmer and Russian stop-word list; both consulted by tokin.
stemmer = SnowballStemmer('russian')
sw = stopwords.words('russian')

# Tokenizer of the multilingual cased BERT checkpoint; consulted by
# predict_single_string. Loaded once at import time.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
16
+
17
class LSTMClassifier(nn.Module):
    """Binary text classifier: embedding -> single-layer LSTM -> linear head.

    Args:
        embedding_dim: size of the vectors produced by ``embedding``; used as
            the LSTM input size.
        hidden_size: size of the LSTM hidden state.
        embedding: embedding layer applied to the input token indices.
    """

    def __init__(self, embedding_dim: int, hidden_size: int, embedding: torch.nn.modules.sparse.Embedding) -> None:
        super().__init__()

        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.embedding = embedding

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            batch_first=True,
        )
        self.clf = nn.Linear(self.hidden_size, 1)

    def forward(self, x):
        """Return one raw score per input sequence.

        Args:
            x: token indices, batch first (batch, seq_len).

        Returns:
            Tensor of shape (batch, 1); no sigmoid is applied here.
        """
        embeddings = self.embedding(x)
        _, (h_n, _) = self.lstm(embeddings)
        # h_n is (1, batch, hidden) for this single-layer, one-direction LSTM.
        # Drop only that leading axis: a bare squeeze() would also remove the
        # batch axis when batch == 1 and the hidden axis when hidden_size == 1.
        out = self.clf(h_n.squeeze(0))
        return out
37
+
38
+
39
def data_preprocessing(text: str) -> str:
    """Normalise a raw string before vocabulary lookup.

    Steps, in order: lowercase, remove HTML-like tags, delete ASCII
    punctuation characters, drop words found in the module-level
    ``stop_words`` and drop words made only of digits.

    Args:
        text (str): input string for preprocessing

    Returns:
        str: the remaining words joined by single spaces
    """
    lowered = text.lower()
    without_tags = re.sub('<.*?>', '', lowered)  # html tags
    # Punctuation is deleted, not replaced by a space, so "a,b" becomes "ab".
    no_punct = without_tags.translate(str.maketrans('', '', string.punctuation))
    kept = []
    for token in no_punct.split():
        if token in stop_words or token.isdigit():
            continue
        kept.append(token)
    return ' '.join(kept)
57
+
58
def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    """Keep the entries whose second element is strictly greater than ``n``.

    Args:
        sorted_words (list): indexable items; ``item[1]`` is compared with
            ``n`` (presumably (word, count) pairs -- confirm against caller)
        n (int): exclusive lower bound. Defaults to 10.

    Returns:
        list: the matching items in their original order
    """
    return [entry for entry in sorted_words if entry[1] > n]
60
+
61
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad or truncate token sequences to a fixed length.

    Args:
        review_int (list): sequences of integer tokens (lists, tuples or
            1-D arrays)
        seq_len (int): target length; a longer sequence keeps its first
            ``seq_len`` tokens, a shorter one is padded with zeros on the left

    Returns:
        np.ndarray: integer array of shape (len(review_int), seq_len)
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        kept = review[:seq_len]
        if len(kept):
            # Write straight into the tail of the zero-filled row. This avoids
            # building a float list of zeros (which rounds very large ints)
            # and works for any sliceable sequence, not only lists.
            features[i, seq_len - len(kept):] = kept
    return features
81
+
82
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
) -> torch.Tensor:
    """Turn one raw string into a fixed-length tensor of vocabulary indices.

    The string is cleaned with ``data_preprocessing``, each remaining word is
    looked up in ``vocab_to_int`` (unknown words are reported on stdout and
    skipped) and the index list is passed through ``padding``.

    Args:
        input_string (str): input single string for preprocessing
        seq_len (int): length of the returned tensor
        vocab_to_int (dict): word corpus {'word': int index}

    Returns:
        torch.Tensor: 1-D tensor with ``seq_len`` entries
    """
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            # Out-of-vocabulary words are dropped, not mapped to a placeholder.
            print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)
108
+
109
def predict_sentence(text: str, model: nn.Module, seq_len: int, vocab_to_int: dict) -> str:
    """Classify one review with the LSTM model.

    Args:
        text (str): raw review text
        model (nn.Module): model called on a (1, seq_len) index tensor; it
            must return a single value. It is switched to eval mode.
        seq_len (int): sequence length passed to ``preprocess_single_string``
        vocab_to_int (dict): word corpus {'word': int index}

    Returns:
        str: 'Негативный отзыв' when the rounded sigmoid of the model output
        is 0, otherwise 'Позитивный отзыв'
    """
    p_str = preprocess_single_string(text, seq_len, vocab_to_int).unsqueeze(0)
    model.eval()
    # Inference only: skip autograd bookkeeping, as predict_single_string does.
    with torch.no_grad():
        pred = model(p_str)
    output = pred.sigmoid().round().item()
    if output == 0:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'
118
+
119
def predict_single_string(text: str,
                          model: BertModel,
                          loaded_model: LogisticRegression
                          ) -> str:
    """Classify one review with BERT features and a fitted classifier.

    The text is tokenized with the module-level ``tokenizer`` and run through
    ``model``. The vector at position 0 of the first model output (presumably
    the [CLS] embedding -- confirm) is given to ``loaded_model.predict_proba``.

    Args:
        text (str): raw review text
        model (BertModel): encoder producing the feature vector
        loaded_model (LogisticRegression): fitted classifier with two classes

    Returns:
        str: 'Негативный отзыв' when the first class is strictly more
        probable than the second, otherwise 'Позитивный отзыв'
    """
    with torch.no_grad():
        # truncation=True clips over-long texts to the tokenizer's configured
        # maximum length instead of overflowing the model's position limit.
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
        output = model(**encoded_input)
        vector = output[0][:, 0, :]
        # One predict_proba call serves both class probabilities.
        probabilities = loaded_model.predict_proba(vector)[0]
    pred0, pred1 = probabilities[0], probabilities[1]
    if pred0 > pred1:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'
134
+
135
def clean(text):
    """Lowercase ``text`` and remove digits and ASCII punctuation.

    Args:
        text (str): raw input string

    Returns:
        str: cleaned text in which every whitespace run (newlines included)
        is a single space; a leading or trailing space is not stripped
    """
    text = text.lower()
    text = re.sub(r'\d+', ' ', text)  # replace each run of digits with a space
    text = text.translate(str.maketrans('', '', string.punctuation))  # delete punctuation
    # Collapse whitespace last: the two steps above leave runs of spaces, and
    # \s+ already covers newlines, so no separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text)

    return text
144
+
145
def tokin(text):
    """Clean, stem and stop-word-filter a string.

    The text goes through ``clean``, every word is stemmed with the
    module-level ``stemmer`` and stems found in ``sw`` are dropped.

    NOTE(review): stop words are matched against the stems, not the original
    words; the order is kept as is because fitted models may depend on it.

    Args:
        text (str): raw input string

    Returns:
        str: the remaining stems joined by single spaces
    """
    stemmed = ' '.join(map(stemmer.stem, clean(text).split()))
    kept = filter(lambda stem: stem not in sw, stemmed.split())
    return ' '.join(kept)
150
+
151
+
152
def predict_ml_class(text, loaded_vectorizer, loaded_classifier):
    """Classify one review with a fitted vectorizer and classifier.

    Args:
        text (str): raw review text
        loaded_vectorizer: fitted vectorizer; assumed to follow the
            scikit-learn convention that ``transform`` takes an iterable of
            documents -- confirm against the object that is loaded
        loaded_classifier: fitted classifier with a ``predict`` method

    Returns:
        str: 'Негативный отзыв' when the predicted label is 0, otherwise
        'Позитивный отзыв'
    """
    # The whole review is ONE document. Passing the list of its words would
    # yield one prediction per word and make the comparison below ambiguous.
    new_text_bow = loaded_vectorizer.transform([tokin(text)])
    predicted_label = loaded_classifier.predict(new_text_bow)[0]
    if predicted_label == 0:
        return 'Негативный отзыв'
    return 'Позитивный отзыв'