osheina committed
Commit b6aa467
Parent: fed8986

Upload 16 files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ cat_model4.cbm filter=lfs diff=lfs merge=lfs -text
+ healthcare_facilities_reviews.csv filter=lfs diff=lfs merge=lfs -text
+ healthcare_facilities_reviews.jsonl filter=lfs diff=lfs merge=lfs -text
BERTmodel_weights2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a449261c46bb588503a65b6cb825f996ae1f2e4af24ceacc6b7a94ef9542bdbf
+ size 116986906
bert_file.py ADDED
@@ -0,0 +1,21 @@
+ from transformers import AutoModel
+ from torch import nn
+
+ class BERTClassifier(nn.Module):
+     def __init__(self, bert_path="cointegrated/rubert-tiny2"):
+         super().__init__()
+         self.bert = AutoModel.from_pretrained(bert_path)
+         for param in self.bert.parameters():
+             param.requires_grad = False
+         self.linear = nn.Sequential(
+             nn.Linear(312, 150),
+             nn.Dropout(0.1),
+             nn.ReLU(),
+             nn.Linear(150, 1),
+             nn.Sigmoid()
+         )
+
+     def forward(self, x, masks):
+         bert_out = self.bert(x, attention_mask=masks)[0][:, 0, :]
+         out = self.linear(bert_out)
+         return out
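For reference, a minimal smoke test of BERTClassifier (not one of the uploaded files; it assumes the cointegrated/rubert-tiny2 checkpoint can be downloaded and mirrors the call pattern used in bert_strim.py below):

    import torch
    from transformers import AutoTokenizer
    from bert_file import BERTClassifier

    tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
    model = BERTClassifier()  # BERT weights are frozen; only the linear head would be trained
    model.eval()

    # Tokenize one review (text is illustrative)
    batch = tokenizer("Очень внимательный врач", padding="max_length", truncation=True,
                      max_length=100, return_tensors="pt")
    with torch.no_grad():
        prob = model(batch["input_ids"], batch["attention_mask"])
    print(prob)  # shape (1, 1); sigmoid output in [0, 1], values above 0.5 are read as a positive review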
bert_strim.py ADDED
@@ -0,0 +1,45 @@
+ import streamlit as st
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ from bert_file import BERTClassifier
+ import numpy as np
+
+ tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
+ model = BERTClassifier()
+ device = 'cpu'
+
+ model.load_state_dict(torch.load('BERTmodel_weights2.pth', map_location=torch.device('cpu')))
+ model.eval()
+
+ @st.cache_data
+ def predict_sentiment(text):
+     MAX_LEN = 100
+     encoded_review = tokenizer.encode_plus(
+         text,
+         max_length=MAX_LEN,
+         add_special_tokens=True,
+         return_token_type_ids=False,
+         pad_to_max_length=True,
+         return_attention_mask=True,
+         return_tensors='pt',
+     )
+     input_ids = encoded_review['input_ids'].to(device)
+     attention_mask = encoded_review['attention_mask'].to(device)
+
+     with torch.no_grad():
+         output = model(input_ids, attention_mask)
+     prediction = torch.round(output).cpu().numpy()[0][0]
+     if prediction == 1:
+         return "Позитивный отзыв 😀"
+     else:
+         return "Негативный отзыв 😟"
+
+ def bert_model_page():
+     st.title("Классификатор отзывов")
+     user_input = st.text_area("Введите отзыв:")
+     if st.button("Классифицировать"):
+         if user_input:
+             prediction = predict_sentiment(user_input)
+             st.write(prediction)
+         else:
+             st.write("Пожалуйста, введите отзыв для классификации.")
cat_model4.cbm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:27995b1be7aee32a51075d40d154e00d7590e9fec2f2408635cd57d563ac0513
+ size 1135408
common_file.py ADDED
@@ -0,0 +1,42 @@
+ import streamlit as st
+ import torch
+ import requests
+ import time
+ import numpy as np
+ import os
+ from toxic1 import toxicity_page
+ from strim_nlp import classic_ml_page
+ from lstm import lstm_model_page
+ from bert_strim import bert_model_page
+
+
+ def app_description_page():
+     st.title("Welcome to My App!")
+     st.write("This is a Streamlit application where you can explore several different models.")
+
+ def model_selection_page():
+     st.sidebar.title("Model Selection")
+     selected_model = st.sidebar.radio("Select a model", ("Classic ML", "LSTM", "BERT"))
+
+     if selected_model == "Classic ML":
+         classic_ml_page()
+         st.write("You selected Classic ML.")
+     elif selected_model == "LSTM":
+         lstm_model_page()
+         st.write("You selected LSTM.")
+     elif selected_model == "BERT":
+         bert_model_page()
+         st.write("You selected BERT.")
+
+ def main():
+     page = st.sidebar.radio("Go to", ("App Description", "Model Selection", "Toxicity Model"))
+
+     if page == "App Description":
+         app_description_page()
+     elif page == "Model Selection":
+         model_selection_page()
+     elif page == "Toxicity Model":
+         toxicity_page()
+
+ if __name__ == "__main__":
+     main()
final_model_bah.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4c57591e552f8f7173253da29d0529a8ff5d0875b4fa7017aa111f5e9f87455
+ size 1506113
healthcare_facilities_reviews.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b329837f76ec5275dc35f7228007a2a55ac62b37f88ad54fef222bd317c8efd3
+ size 79002044
healthcare_facilities_reviews.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:74e8cb7f3eb5981b0934f66856123900a0f4c6ca83b1b06704e50deafea2b186
+ size 95300708
lstm.py ADDED
@@ -0,0 +1,39 @@
+ import streamlit as st
+ import torch
+ import re
+ import json
+ from nltk.corpus import stopwords
+ from model_file import data_preprocessing, preprocess_single_string, LSTMBahdanauAttention
+ from nltk.corpus import stopwords
+ stop_words = set(stopwords.words('russian'))
+
+ # Load vocabulary mapping
+ with open('vocab_to_int.json', 'r') as file:
+     vocab_to_int = json.load(file)
+
+ # Load the pre-trained model
+ SEQ_LEN = 96
+ model_bah = LSTMBahdanauAttention()
+ # Load the trained weights on CPU
+ model_bah.load_state_dict(torch.load('final_model_bah.pth', map_location=torch.device('cpu')))
+ model_bah.eval()
+
+ # Function to analyze sentiment
+ def analyze_sentiment(text):
+     preprocessed_text = data_preprocessing(text)
+     sample = preprocess_single_string(preprocessed_text, SEQ_LEN, vocab_to_int)
+
+     with torch.no_grad():
+         probability = model_bah(sample.unsqueeze(0))[0].sigmoid().item()
+     return probability
+
+ # Streamlit UI
+ def lstm_model_page():
+     st.title("Классификация отзывов лечебных учреждений")
+     user_input = st.text_area("Введите ваш отзыв:")
+     if st.button("Классифицировать"):
+         probability = analyze_sentiment(user_input)
+         if probability > 0.5:
+             st.write("Отзыв положительный 🌟")
+         else:
+             st.write("Отзыв отрицательный 😞")
model_file.py ADDED
@@ -0,0 +1,176 @@
+ import re
+ import string
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ from nltk.corpus import stopwords
+ stop_words = set(stopwords.words('russian'))
+ from collections import Counter
+ from gensim.models import Word2Vec
+ import pandas as pd
+ import torch.nn.functional as F
+
+
+ HIDDEN_SIZE = 32
+ SEQ_LEN = 32
+ df = pd.read_json('healthcare_facilities_reviews.jsonl', lines=True)
+
+ def data_preprocessing(text: str) -> str:
+     text = text.lower()
+     text = re.sub('<.*?>', '', text)  # remove html tags
+     text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+     text = ' '.join([word for word in text.split() if word not in stop_words])
+     text = [word for word in text.split() if not word.isdigit()]
+     text = ' '.join(text)
+     return text
+
+ contents = df['content'].tolist()
+ preprocessed = [data_preprocessing(content) for content in contents]
+
+ corpus = [word for text in preprocessed for word in text.split()]
+ sorted_words = Counter(corpus).most_common()
+
+ def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
+     return list(filter(lambda x: x[1] > n, sorted_words))
+
+ sorted_words = get_words_by_freq(sorted_words, 100)
+ sorted_words[-10:]  # no-op at module level; notebook leftover for inspecting the rarest kept words
+
+ vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}
+
+ reviews_int = []
+ for text in preprocessed:
+     r = [vocab_to_int[word] for word in text.split() if vocab_to_int.get(word)]
+     reviews_int.append(r)
+
+ w2v_input = []
+ for review in preprocessed:
+     cur_review = []
+     for word in review.split():
+         if vocab_to_int.get(word):
+             cur_review.append(word)
+     w2v_input.append(cur_review)
+
+ VOCAB_SIZE = len(vocab_to_int) + 1
+
+ EMBEDDING_DIM = 64
+
+ wv = Word2Vec(
+     min_count=1,                # minimum word frequency in the corpus
+     vector_size=EMBEDDING_DIM   # dimensionality of the word vectors
+ )
+ wv.build_vocab(w2v_input)
+
+ wv.train(
+     corpus_iterable=w2v_input,
+     total_examples=wv.corpus_count,
+     epochs=10
+ )
+ embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
+
+ # Walk over every word in the vocabulary: if the word is present, take its vector;
+ # if it is missing, print it and skip it
+ for word, i in vocab_to_int.items():
+     try:
+         embedding_vector = wv.wv[word]
+         embedding_matrix[i] = embedding_vector
+     except KeyError as e:
+         pass
+         print(f'{e}: word: {word}')
+
+ # Build the pretrained embedding: this layer will not be trained in our network
+ embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
+
+ def data_preprocessing(text: str) -> str:  # note: redefines the identical function above
+     text = text.lower()
+     text = re.sub('<.*?>', '', text)  # remove html tags
+     text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+     text = ' '.join([word for word in text.split() if word not in stop_words])
+     text = [word for word in text.split() if not word.isdigit()]
+     text = ' '.join(text)
+     return text
+
+
+
+ def padding(review_int: list, seq_len: int) -> np.array:  # type: ignore
+     features = np.zeros((len(review_int), seq_len), dtype=int)
+     for i, review in enumerate(review_int):
+         if len(review) <= seq_len:
+             zeros = list(np.zeros(seq_len - len(review)))
+             new = zeros + review
+         else:
+             new = review[:seq_len]
+         features[i, :] = np.array(new)
+
+     return features
+
+ def preprocess_single_string(
+     input_string: str,
+     seq_len: int,
+     vocab_to_int: dict,
+     verbose: bool = False
+ ) -> torch.tensor:
+     preprocessed_string = data_preprocessing(input_string)
+     result_list = []
+     for word in preprocessed_string.split():
+         try:
+             result_list.append(vocab_to_int[word])
+         except KeyError as e:
+             if verbose:
+                 print(f'{e}: not in dictionary!')
+             pass
+     result_padded = padding([result_list], seq_len)[0]
+
+     return torch.tensor(result_padded)
+
+ class BahdanauAttention(nn.Module):
+     def __init__(
+         self,
+         hidden_size: int = HIDDEN_SIZE
+     ) -> None:
+
+         super().__init__()
+         self.hidden_size = hidden_size
+         self.W = nn.Linear(hidden_size, hidden_size)
+         self.U = nn.Linear(hidden_size, hidden_size)
+         self.V = nn.Linear(hidden_size, 1)
+         self.tanh = nn.Tanh()
+
+     def forward(
+         self,
+         keys: torch.Tensor,   # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
+         query: torch.Tensor   # BATCH_SIZE x HIDDEN_SIZE
+     ):
+
+         query = query.unsqueeze(1)   # BATCH_SIZE x 1 x HIDDEN_SIZE
+         r_query = self.W(query)      # BATCH_SIZE x 1 x HIDDEN_SIZE
+
+         r_keys = self.U(keys)        # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
+
+         scores = self.V(torch.tanh(r_query + r_keys))  # BATCH_SIZE x SEQ_LEN x 1
+         scores = scores.squeeze(-1)                    # BATCH_SIZE x SEQ_LEN
+         att_weights = F.softmax(scores, dim=1)         # BATCH_SIZE x SEQ_LEN
+         context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze(1)  # BATCH_SIZE x HIDDEN_SIZE
+         return context, att_weights
+
+ class LSTMBahdanauAttention(nn.Module):
+     def __init__(self) -> None:
+         super().__init__()
+
+         # self.embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
+         self.embedding = embedding_layer
+         self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
+         self.attn = BahdanauAttention(HIDDEN_SIZE)
+         self.clf = nn.Sequential(
+             nn.Linear(HIDDEN_SIZE, 128),
+             nn.Dropout(),
+             nn.Tanh(),
+             nn.Linear(128, 1)
+         )
+
+     def forward(self, x):
+         embeddings = self.embedding(x)
+         outputs, (h_n, _) = self.lstm(embeddings)
+         context, att_weights = self.attn(outputs, h_n.squeeze(0))
+         out = self.clf(context)
+         return out, att_weights
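model_file.py rebuilds the vocabulary and Word2Vec embeddings from healthcare_facilities_reviews.jsonl every time it is imported; lstm.py then loads its word-to-index mapping from vocab_to_int.json, presumably so the token indices stay consistent with the trained weights. A sketch of how that JSON file could be produced from this module (the actual export script is not part of the commit):

    import json

    # Importing model_file runs the corpus preprocessing and Word2Vec training above,
    # so this requires healthcare_facilities_reviews.jsonl locally and takes a while.
    from model_file import vocab_to_int

    with open("vocab_to_int.json", "w", encoding="utf-8") as f:
        json.dump(vocab_to_int, f, ensure_ascii=False)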
requirements.txt ADDED
@@ -0,0 +1,71 @@
+ altair==5.3.0
+ attrs==23.2.0
+ blinker==1.7.0
+ cachetools==5.3.3
+ catboost==1.2.3
+ certifi==2024.2.2
+ charset-normalizer==3.3.2
+ click==8.1.7
+ contourpy==1.2.1
+ cycler==0.12.1
+ filelock==3.13.4
+ fonttools==4.51.0
+ fsspec==2024.3.1
+ gensim==4.3.2
+ gitdb==4.0.11
+ GitPython==3.1.43
+ graphviz==0.20.3
+ huggingface-hub==0.22.2
+ idna==3.7
+ Jinja2==3.1.3
+ joblib==1.4.0
+ jsonschema==4.21.1
+ jsonschema-specifications==2023.12.1
+ kiwisolver==1.4.5
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.8.4
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.3
+ nltk==3.8.1
+ numpy==1.26.4
+ packaging==24.0
+ pandas==2.2.2
+ pillow==10.3.0
+ plotly==5.20.0
+ protobuf==4.25.3
+ pyarrow==15.0.2
+ pydeck==0.8.1b0
+ Pygments==2.17.2
+ pymystem3==0.2.0
+ pyparsing==3.1.2
+ python-dateutil==2.9.0.post0
+ pytz==2024.1
+ PyYAML==6.0.1
+ referencing==0.34.0
+ regex==2023.12.25
+ requests==2.31.0
+ rich==13.7.1
+ rpds-py==0.18.0
+ safetensors==0.4.2
+ scikit-learn==1.4.2
+ scipy==1.13.0
+ six==1.16.0
+ smart-open==7.0.4
+ smmap==5.0.1
+ streamlit==1.33.0
+ sympy==1.12
+ tenacity==8.2.3
+ threadpoolctl==3.4.0
+ tokenizers==0.15.2
+ toml==0.10.2
+ toolz==0.12.1
+ torch==2.2.2
+ tornado==6.4
+ tqdm==4.66.2
+ transformers==4.39.3
+ typing_extensions==4.11.0
+ tzdata==2024.1
+ urllib3==2.2.1
+ wrapt==1.16.0
rnn_preprocessing.py ADDED
@@ -0,0 +1,80 @@
+ import re
+ import string
+ import numpy as np
+ import torch
+
+ from nltk.corpus import stopwords
+ stop_words = set(stopwords.words('russian'))
+
+ def data_preprocessing(text: str) -> str:
+     """Preprocess a string: lowercase, remove html tags, punctuation,
+     stopwords and digits
+
+     Args:
+         text (str): input string for preprocessing
+
+     Returns:
+         str: preprocessed string
+     """
+
+     text = text.lower()
+     text = re.sub('<.*?>', '', text)  # remove html tags
+     text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+     text = ' '.join([word for word in text.split() if word not in stop_words])
+     text = [word for word in text.split() if not word.isdigit()]
+     text = ' '.join(text)
+     return text
+
+ def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
+     return list(filter(lambda x: x[1] > n, sorted_words))
+
+ def padding(review_int: list, seq_len: int) -> np.array:  # type: ignore
+     """Make left-sided padding for an input list of token sequences
+
+     Args:
+         review_int (list): input list of token sequences
+         seq_len (int): max sequence length; if len(review_int[i]) > seq_len it is trimmed, else it is left-padded with zeros
+
+     Returns:
+         np.array: padded sequences
+     """
+     features = np.zeros((len(review_int), seq_len), dtype=int)
+     for i, review in enumerate(review_int):
+         if len(review) <= seq_len:
+             zeros = list(np.zeros(seq_len - len(review)))
+             new = zeros + review
+         else:
+             new = review[:seq_len]
+         features[i, :] = np.array(new)
+
+     return features
+
+ def preprocess_single_string(
+     input_string: str,
+     seq_len: int,
+     vocab_to_int: dict,
+     verbose: bool = False
+ ) -> torch.tensor:
+     """Run all preprocessing steps on a single string
+
+     Args:
+         input_string (str): input string for preprocessing
+         seq_len (int): max sequence length; if the tokenized string is longer it is trimmed, else it is left-padded with zeros
+         vocab_to_int (dict): word-to-index mapping {'word': int index}
+
+     Returns:
+         torch.Tensor: token ids padded or trimmed to seq_len
+     """
+
+     preprocessed_string = data_preprocessing(input_string)
+     result_list = []
+     for word in preprocessed_string.split():
+         try:
+             result_list.append(vocab_to_int[word])
+         except KeyError as e:
+             if verbose:
+                 print(f'{e}: not in dictionary!')
+             pass
+     result_padded = padding([result_list], seq_len)[0]
+
+     return torch.tensor(result_padded)
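A short, self-contained example of the helpers above with a toy vocabulary (illustrative values only; assumes the NLTK Russian stopword list has been downloaded):

    from rnn_preprocessing import preprocess_single_string

    toy_vocab = {"хороший": 1, "врач": 2}  # illustrative word -> index mapping
    sample = preprocess_single_string("Очень хороший врач!", seq_len=8, vocab_to_int=toy_vocab)
    print(sample)  # tensor([0, 0, 0, 0, 0, 0, 1, 2]): left-padded to seq_len, stopword "очень" dropped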
strim_nlp.py ADDED
@@ -0,0 +1,59 @@
+ import streamlit as st
+ import pandas as pd
+ import catboost
+ from catboost import CatBoostClassifier
+ import re
+ import string
+ from nltk.corpus import stopwords
+ from pymystem3 import Mystem
+ from joblib import load
+ import nltk
+
+ def data_preprocessing(text):
+     stop_words = set(stopwords.words('russian'))
+     text = text.lower()
+     text = re.sub("<.*?>", "", text)
+     text = re.sub(r'http\S+', " ", text)
+     text = re.sub(r'@\w+', ' ', text)
+     text = re.sub(r'#\w+', ' ', text)
+     text = re.sub(r'\d+', ' ', text)
+     text = "".join([c for c in text if c not in string.punctuation])
+     return " ".join([word for word in text.split() if word not in stop_words])
+
+ def lemmatize_text(text):
+     mystem = Mystem()
+     lemmas = mystem.lemmatize(text)
+     return ' '.join(lemmas)
+
+ model = CatBoostClassifier()
+ model.load_model('cat_model4.cbm')
+
+ tfidf_vectorizer = load('tfidf_vectorizer.joblib')
+
+ def classic_ml_page():
+     st.title("Классификация отзывов о медицинских учреждениях")
+     user_review = st.text_area("Введите ваш отзыв здесь:")
+
+     if st.button("Классифицировать"):
+         if user_review:
+             preprocessed_review = data_preprocessing(user_review)
+             lemmatized_review = lemmatize_text(preprocessed_review)
+             vectorized_review = tfidf_vectorizer.transform([lemmatized_review])
+             prediction = model.predict(vectorized_review)
+
+             if prediction[0] == 1:
+                 st.write("Позитивный отзыв 😀")
+             else:
+                 st.write("Негативный отзыв 😟")
+         else:
+             st.write("Пожалуйста, введите отзыв для классификации.")
+
+
+
+
+
+
+
+
+
+
tfidf_vectorizer.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:621e7e86acf6a032018e0e5ebf0876579f4f846478a70e782eb3c476298c088f
+ size 1750676
toxic1.py ADDED
@@ -0,0 +1,40 @@
+ # toxic1.py
+ import streamlit as st
+ import numpy as np
+ import pandas as pd
+ import time
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ model_t_checkpoint = 'cointegrated/rubert-tiny-toxicity'
+ tokenizer_t = AutoTokenizer.from_pretrained(model_t_checkpoint)
+ model_t = AutoModelForSequenceClassification.from_pretrained(model_t_checkpoint)
+
+ def text2toxicity(text, aggregate=True):
+     with torch.no_grad():
+         inputs = tokenizer_t(text, return_tensors='pt', truncation=True, padding=True).to('cpu')
+         proba = torch.sigmoid(model_t(**inputs).logits).cpu().numpy()
+     if isinstance(text, str):
+         proba = proba[0]
+     if aggregate:
+         return 1 - proba.T[0] * (1 - proba.T[-1])  # collapse per-label probabilities into a single toxicity score
+     return proba
+
+ def toxicity_page():
+     st.title("""
+     Определим токсичный комментарий или нет
+     """)
+
+     user_text_input = st.text_area('Введите ваш отзыв здесь:')
+
+     if st.button('Предсказать'):
+         start_time = time.time()
+         proba = text2toxicity(user_text_input, True)
+         end_time = time.time()
+         prediction_time = end_time - start_time
+
+         if proba >= 0.5:
+             st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий токсичный.')
+         else:
+             st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий не токсичный.')
+         st.write(f'Время предсказания: {prediction_time:.4f} секунд')
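A quick way to exercise text2toxicity outside of Streamlit (illustrative input; assumes the cointegrated/rubert-tiny-toxicity checkpoint downloads successfully):

    from toxic1 import text2toxicity

    text = "привет, как дела?"                  # neutral example phrase
    print(text2toxicity(text, aggregate=True))   # single float, expected to be close to 0 for neutral text
    print(text2toxicity(text, aggregate=False))  # per-label probabilities from the classifier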
vocab_to_int.json ADDED
The diff for this file is too large to render. See raw diff