Veronika1101 committed on
Commit
d15a7ed
1 Parent(s): f8f4553

Upload 20 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Data/chad_806facbe78804299a9eeeab5fb0a387b_3.png filter=lfs diff=lfs merge=lfs -text
+Data/healthcare_facilities_reviews.csv filter=lfs diff=lfs merge=lfs -text
+Data/healthcare_facilities_reviews.jsonl filter=lfs diff=lfs merge=lfs -text
+Weights/cat_model4.cbm filter=lfs diff=lfs merge=lfs -text
Data/20182704132259.jpg ADDED
Data/c793397a-39df-5ff7-8137-e59568352c11.jpeg ADDED
Data/chad_806facbe78804299a9eeeab5fb0a387b_3.png ADDED

Git LFS Details

  • SHA256: 835384bd33c9055373d50ba319e0d38f9411ae6c9867f69b6fc017fc3aa220f5
  • Pointer size: 132 Bytes
  • Size of remote file: 1.14 MB
Data/healthcare_facilities_reviews.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b329837f76ec5275dc35f7228007a2a55ac62b37f88ad54fef222bd317c8efd3
+size 79002044
Data/healthcare_facilities_reviews.jsonl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74e8cb7f3eb5981b0934f66856123900a0f4c6ca83b1b06704e50deafea2b186
+size 95300708
Data/maxresdefault.jpg ADDED
Models/bert_file.py ADDED
@@ -0,0 +1,21 @@
+from transformers import AutoModel
+from torch import nn
+
+class BERTClassifier(nn.Module):
+    def __init__(self, bert_path="cointegrated/rubert-tiny2"):
+        super().__init__()
+        self.bert = AutoModel.from_pretrained(bert_path)
+        # Freeze the backbone: only the classification head is trainable.
+        for param in self.bert.parameters():
+            param.requires_grad = False
+        self.linear = nn.Sequential(
+            nn.Linear(312, 150),  # 312 is the rubert-tiny2 hidden size
+            nn.Dropout(0.1),
+            nn.ReLU(),
+            nn.Linear(150, 1),
+            nn.Sigmoid()
+        )
+
+    def forward(self, x, masks):
+        # Take the [CLS] token embedding from the last hidden state.
+        bert_out = self.bert(x, attention_mask=masks)[0][:, 0, :]
+        out = self.linear(bert_out)
+        return out
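For orientation, a minimal usage sketch of this classifier head (illustrative only, not part of the commit; the tokenizer matches the default bert_path above):

import torch
from transformers import AutoTokenizer
from Models.bert_file import BERTClassifier

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = BERTClassifier()
model.eval()

enc = tokenizer("Отличная клиника!", return_tensors="pt")
with torch.no_grad():
    prob = model(enc["input_ids"], enc["attention_mask"])  # sigmoid output, shape [1, 1]
print(float(prob))  # value in [0, 1]
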
Models/bert_strim.py ADDED
@@ -0,0 +1,50 @@
+import streamlit as st
+from transformers import AutoTokenizer
+import torch
+from Models.bert_file import BERTClassifier
+import time
+
+tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
+model = BERTClassifier()
+device = 'cpu'
+
+model.load_state_dict(torch.load('Weights/BERTmodel_weights2.pth', map_location=torch.device('cpu')))
+model.eval()
+
+@st.cache_data
+def predict_sentiment(text):
+    MAX_LEN = 100
+    encoded_review = tokenizer.encode_plus(
+        text,
+        max_length=MAX_LEN,
+        add_special_tokens=True,
+        return_token_type_ids=False,
+        padding='max_length',  # pad_to_max_length is deprecated
+        truncation=True,
+        return_attention_mask=True,
+        return_tensors='pt',
+    )
+    input_ids = encoded_review['input_ids'].to(device)
+    attention_mask = encoded_review['attention_mask'].to(device)
+
+    with torch.no_grad():
+        output = model(input_ids, attention_mask)
+    prediction = torch.round(output).cpu().numpy()[0][0]
+    if prediction == 1:
+        return "Позитивный отзыв 😀"
+    else:
+        return "Негативный отзыв 😟"
+
+def bert_model_page():
+    st.title("Классификация отзывов")
+    user_input = st.text_area("Введите отзыв:")
+    if st.button("Классифицировать"):
+        if user_input:
+            start_time = time.time()
+            prediction = predict_sentiment(user_input)
+            end_time = time.time()
+            execution_time = end_time - start_time
+            st.write("Результат классификации:", prediction)
+            st.write(f'Время предсказания: {execution_time:.4f} секунд')
+        else:
+            st.write("Пожалуйста, введите отзыв для классификации.")
Models/lstm.py ADDED
@@ -0,0 +1,38 @@
+import streamlit as st
+import torch
+import json
+import time
+from Models.model_file import data_preprocessing, preprocess_single_string, LSTMBahdanauAttention
+
+with open('Weights/vocab_to_int.json', 'r') as file:
+    vocab_to_int = json.load(file)
+
+SEQ_LEN = 96
+model_bah = LSTMBahdanauAttention()
+model_bah.load_state_dict(torch.load('Weights/final_model_bah.pth', map_location='cpu'))
+model_bah.eval()
+
+def analyze_sentiment(text):
+    preprocessed_text = data_preprocessing(text)
+    sample = preprocess_single_string(preprocessed_text, SEQ_LEN, vocab_to_int)
+    with torch.no_grad():
+        # The model returns (logits, attention weights); squash the logit to a probability.
+        probability = model_bah(sample.unsqueeze(0))[0].sigmoid().item()
+    return probability
+
+def lstm_model_page():
+    st.title("Классификация отзывов")
+    user_input = st.text_area("Введите ваш отзыв:")
+    if st.button("Классифицировать"):
+        start_time = time.time()
+        probability = analyze_sentiment(user_input)
+        end_time = time.time()
+        execution_time = end_time - start_time
+        if probability > 0.5:
+            st.write("Отзыв положительный 🌟")
+        else:
+            st.write("Отзыв отрицательный 😞")
+        st.write(f'Время предсказания: {execution_time:.4f} секунд')
Models/model_file.py ADDED
@@ -0,0 +1,176 @@
+import re
+import string
+from collections import Counter
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from gensim.models import Word2Vec
+from nltk.corpus import stopwords
+
+stop_words = set(stopwords.words('russian'))
+
+HIDDEN_SIZE = 32
+SEQ_LEN = 32
+df = pd.read_json('Data/healthcare_facilities_reviews.jsonl', lines=True)
+
+def data_preprocessing(text: str) -> str:
+    text = text.lower()
+    text = re.sub('<.*?>', '', text)  # strip HTML tags
+    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+    text = ' '.join([word for word in text.split() if word not in stop_words])
+    text = [word for word in text.split() if not word.isdigit()]  # drop pure digits
+    text = ' '.join(text)
+    return text
+
+contents = df['content'].tolist()
+preprocessed = [data_preprocessing(content) for content in contents]
+
+corpus = [word for text in preprocessed for word in text.split()]
+sorted_words = Counter(corpus).most_common()
+
+def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
+    return list(filter(lambda x: x[1] > n, sorted_words))
+
+# Keep only words that occur more than 100 times; index 0 is reserved for padding.
+sorted_words = get_words_by_freq(sorted_words, 100)
+vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}
+
+reviews_int = []
+for text in preprocessed:
+    r = [vocab_to_int[word] for word in text.split() if vocab_to_int.get(word)]
+    reviews_int.append(r)
+
+w2v_input = []
+for review in preprocessed:
+    cur_review = []
+    for word in review.split():
+        if vocab_to_int.get(word):
+            cur_review.append(word)
+    w2v_input.append(cur_review)
+
+VOCAB_SIZE = len(vocab_to_int) + 1  # +1 for the padding index
+EMBEDDING_DIM = 64
+
+wv = Word2Vec(
+    min_count=1,
+    vector_size=EMBEDDING_DIM
+)
+wv.build_vocab(w2v_input)
+wv.train(
+    corpus_iterable=w2v_input,
+    total_examples=wv.corpus_count,
+    epochs=10
+)
+
+embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
+
+# Walk the vocabulary: copy the trained vector for every word that has one;
+# print and skip any word that is missing.
+for word, i in vocab_to_int.items():
+    try:
+        embedding_matrix[i] = wv.wv[word]
+    except KeyError as e:
+        print(f'{e}: word: {word}')
+
+# Pretrained embedding layer; it is frozen and will not be updated during training.
+embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
+
+def padding(review_int: list, seq_len: int) -> np.ndarray:
+    # Left-pad short reviews with zeros; truncate long ones to seq_len.
+    features = np.zeros((len(review_int), seq_len), dtype=int)
+    for i, review in enumerate(review_int):
+        if len(review) <= seq_len:
+            zeros = list(np.zeros(seq_len - len(review)))
+            new = zeros + review
+        else:
+            new = review[:seq_len]
+        features[i, :] = np.array(new)
+    return features
+
+def preprocess_single_string(
+    input_string: str,
+    seq_len: int,
+    vocab_to_int: dict,
+    verbose: bool = False
+) -> torch.Tensor:
+    preprocessed_string = data_preprocessing(input_string)
+    result_list = []
+    for word in preprocessed_string.split():
+        try:
+            result_list.append(vocab_to_int[word])
+        except KeyError as e:
+            if verbose:
+                print(f'{e}: not in dictionary!')
+    result_padded = padding([result_list], seq_len)[0]
+    return torch.tensor(result_padded)
+
+class BahdanauAttention(nn.Module):
+    def __init__(self, hidden_size: int = HIDDEN_SIZE) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.W = nn.Linear(hidden_size, hidden_size)
+        self.U = nn.Linear(hidden_size, hidden_size)
+        self.V = nn.Linear(hidden_size, 1)
+
+    def forward(
+        self,
+        keys: torch.Tensor,   # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
+        query: torch.Tensor   # BATCH_SIZE x HIDDEN_SIZE
+    ):
+        query = query.unsqueeze(1)                      # BATCH_SIZE x 1 x HIDDEN_SIZE
+        r_query = self.W(query)                         # BATCH_SIZE x 1 x HIDDEN_SIZE
+        r_keys = self.U(keys)                           # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
+        scores = self.V(torch.tanh(r_query + r_keys))   # BATCH_SIZE x SEQ_LEN x 1
+        scores = scores.squeeze(-1)                     # BATCH_SIZE x SEQ_LEN
+        att_weights = F.softmax(scores, dim=1)          # BATCH_SIZE x SEQ_LEN
+        context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze(1)  # BATCH_SIZE x HIDDEN_SIZE
+        return context, att_weights
+
+class LSTMBahdanauAttention(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.embedding = embedding_layer
+        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
+        self.attn = BahdanauAttention(HIDDEN_SIZE)
+        self.clf = nn.Sequential(
+            nn.Linear(HIDDEN_SIZE, 128),
+            nn.Dropout(),
+            nn.Tanh(),
+            nn.Linear(128, 1)
+        )
+
+    def forward(self, x):
+        embeddings = self.embedding(x)
+        outputs, (h_n, _) = self.lstm(embeddings)
+        # Attend over all LSTM outputs, using the final hidden state as the query.
+        context, att_weights = self.attn(outputs, h_n.squeeze(0))
+        out = self.clf(context)
+        return out, att_weights
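A minimal end-to-end sketch with these pieces (illustrative only, not part of the commit; the sample review is arbitrary):

import torch
from Models.model_file import LSTMBahdanauAttention, preprocess_single_string, vocab_to_int, SEQ_LEN

model = LSTMBahdanauAttention()
model.eval()

sample = preprocess_single_string("очень хорошая клиника", SEQ_LEN, vocab_to_int)
with torch.no_grad():
    logit, att_weights = model(sample.unsqueeze(0))  # shapes: [1, 1] and [1, SEQ_LEN]
print(torch.sigmoid(logit).item())  # positive-review probability
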
Models/rnn_preprocessing.py ADDED
@@ -0,0 +1,50 @@
+import re
+import string
+import numpy as np
+import torch
+
+from nltk.corpus import stopwords
+stop_words = set(stopwords.words('russian'))
+
+def data_preprocessing(text: str) -> str:
+    text = text.lower()
+    text = re.sub('<.*?>', '', text)  # strip HTML tags
+    text = ''.join([c for c in text if c not in string.punctuation])
+    text = ' '.join([word for word in text.split() if word not in stop_words])
+    text = [word for word in text.split() if not word.isdigit()]
+    text = ' '.join(text)
+    return text
+
+def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
+    return list(filter(lambda x: x[1] > n, sorted_words))
+
+def padding(review_int: list, seq_len: int) -> np.ndarray:
+    # Left-pad short reviews with zeros; truncate long ones to seq_len.
+    features = np.zeros((len(review_int), seq_len), dtype=int)
+    for i, review in enumerate(review_int):
+        if len(review) <= seq_len:
+            zeros = list(np.zeros(seq_len - len(review)))
+            new = zeros + review
+        else:
+            new = review[:seq_len]
+        features[i, :] = np.array(new)
+    return features
+
+def preprocess_single_string(
+    input_string: str,
+    seq_len: int,
+    vocab_to_int: dict,
+    verbose: bool = False
+) -> torch.Tensor:
+    preprocessed_string = data_preprocessing(input_string)
+    result_list = []
+    for word in preprocessed_string.split():
+        try:
+            result_list.append(vocab_to_int[word])
+        except KeyError as e:
+            if verbose:
+                print(f'{e}: not in dictionary!')
+    result_padded = padding([result_list], seq_len)[0]
+    return torch.tensor(result_padded)
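A quick illustration of what padding does (arbitrary values, illustrative only):

from Models.rnn_preprocessing import padding

# seq_len=5: the short review is left-padded with zeros, the long one truncated.
print(padding([[7, 8], [1, 2, 3, 4, 5, 6]], 5))
# -> [[0 0 0 7 8]
#     [1 2 3 4 5]]
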
Models/strim_nlp.py ADDED
@@ -0,0 +1,64 @@
+import streamlit as st
+from catboost import CatBoostClassifier
+import re
+import string
+from nltk.corpus import stopwords
+from pymystem3 import Mystem
+from joblib import load
+import nltk
+nltk.download('stopwords')
+import time
+
+def data_preprocessing(text):
+    stop_words = set(stopwords.words('russian'))
+    text = text.lower()
+    text = re.sub("<.*?>", "", text)      # HTML tags
+    text = re.sub(r'http\S+', " ", text)  # URLs
+    text = re.sub(r'@\w+', ' ', text)     # mentions
+    text = re.sub(r'#\w+', ' ', text)     # hashtags
+    text = re.sub(r'\d+', ' ', text)      # digits
+    text = "".join([c for c in text if c not in string.punctuation])
+    return " ".join([word for word in text.split() if word not in stop_words])
+
+def lemmatize_text(text):
+    mystem = Mystem()
+    lemmas = mystem.lemmatize(text)
+    return ' '.join(lemmas)
+
+model = CatBoostClassifier()
+model.load_model('Weights/cat_model4.cbm')
+
+tfidf_vectorizer = load('Weights/tfidf_vectorizer.joblib')
+
+def classic_ml_page():
+    st.title("Классификация отзывов")
+    user_review = st.text_area("Введите ваш отзыв здесь:")
+
+    if st.button("Классифицировать"):
+        if user_review:
+            preprocessed_review = data_preprocessing(user_review)
+            lemmatized_review = lemmatize_text(preprocessed_review)
+            vectorized_review = tfidf_vectorizer.transform([lemmatized_review])
+            start_time = time.time()
+            prediction = model.predict(vectorized_review)
+            end_time = time.time()
+            execution_time = end_time - start_time
+            if prediction[0] == 1:
+                st.write("Позитивный отзыв 😀")
+            else:
+                st.write("Негативный отзыв 😟")
+            st.write(f'Время предсказания: {execution_time:.4f} секунд')
+        else:
+            st.write("Пожалуйста, введите отзыв для классификации.")
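For reference, a hedged sketch of the same TF-IDF + CatBoost inference path outside Streamlit (illustrative usage only; it assumes the weight files above are present):

from Models.strim_nlp import data_preprocessing, lemmatize_text, tfidf_vectorizer, model

text = "Прекрасный персонал, всем доволен"
features = tfidf_vectorizer.transform([lemmatize_text(data_preprocessing(text))])
print("positive" if model.predict(features)[0] == 1 else "negative")
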
Models/toxic1.py ADDED
@@ -0,0 +1,44 @@
+# toxic1.py
+import streamlit as st
+import time
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+model_t_checkpoint = 'cointegrated/rubert-tiny-toxicity'
+tokenizer_t = AutoTokenizer.from_pretrained(model_t_checkpoint)
+model_t = AutoModelForSequenceClassification.from_pretrained(model_t_checkpoint)
+
+def text2toxicity(text, aggregate=True):
+    with torch.no_grad():
+        inputs = tokenizer_t(text, return_tensors='pt', truncation=True, padding=True).to('cpu')
+        proba = torch.sigmoid(model_t(**inputs).logits).cpu().numpy()
+    if isinstance(text, str):
+        proba = proba[0]
+    if aggregate:
+        # Collapse the per-label probabilities into a single toxicity score.
+        return 1 - proba.T[0] * (1 - proba.T[-1])
+    return proba
+
+def toxicity_page():
+    st.title("Определим, токсичный комментарий или нет")
+    user_text_input = st.text_area('Введите ваш отзыв здесь:')
+
+    if st.button('Предсказать'):
+        start_time = time.time()
+        proba = text2toxicity(user_text_input, True)
+        end_time = time.time()
+        prediction_time = end_time - start_time
+
+        if proba >= 0.5:
+            st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий токсичный.')
+            st.image('Data/maxresdefault.jpg')
+        else:
+            st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий не токсичный.')
+            st.image('Data/c793397a-39df-5ff7-8137-e59568352c11.jpeg')
+        st.write(f'Время предсказания: {prediction_time:.4f} секунд')
+
+        st.markdown("<h3 style='font-size: 18px;'>Ссылка на Токсичный бот</h3>", unsafe_allow_html=True)
+        st.markdown("[Токсичный бот](https://t.me/toxic1101_bot)")
Weights/BERTmodel_weights2.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a449261c46bb588503a65b6cb825f996ae1f2e4af24ceacc6b7a94ef9542bdbf
+size 116986906
Weights/cat_model4.cbm ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:27995b1be7aee32a51075d40d154e00d7590e9fec2f2408635cd57d563ac0513
+size 1135408
Weights/final_model_bah.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4c57591e552f8f7173253da29d0529a8ff5d0875b4fa7017aa111f5e9f87455
+size 1506113
Weights/tfidf_vectorizer.joblib ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:621e7e86acf6a032018e0e5ebf0876579f4f846478a70e782eb3c476298c088f
+size 1750676
Weights/vocab_to_int.json ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,69 @@
+import streamlit as st
+import base64
+import pandas as pd
+from Models.toxic1 import toxicity_page
+from Models.strim_nlp import classic_ml_page
+from Models.lstm import lstm_model_page
+from Models.bert_strim import bert_model_page
+
+background_image = 'Data/chad_806facbe78804299a9eeeab5fb0a387b_3.png'
+st.markdown(
+    f"""
+    <style>
+    .reportview-container {{
+        background: url(data:image/png;base64,{base64.b64encode(open(background_image, "rb").read()).decode()});
+        background-size: cover;
+    }}
+    </style>
+    """, unsafe_allow_html=True
+)
+
+def app_description_page():
+    st.title("Welcome to My App!")
+    st.markdown("<h3 style='font-size: 18px;'>This is a Streamlit application where you can explore four different models.</h3>", unsafe_allow_html=True)
+    st.markdown("<h3 style='font-size: 18px;'>About the project:</h3>", unsafe_allow_html=True)
+    st.markdown("<h3 style='font-size: 18px;'>The task is to train three different models on a dataset of clinic reviews.</h3>", unsafe_allow_html=True)
+    st.markdown("<h3 style='font-size: 18px;'>You can write text and the model will classify it as “Negative” or “Positive”.</h3>", unsafe_allow_html=True)
+    data = {
+        "Model": ["CatBoostClassifier", "LSTM", "Rubert-tiny2", "Rubert-tiny-toxicity"],
+        "F1 metric": [0.87, 0.94, 0.90, 0.84]
+    }
+    df = pd.DataFrame(data)
+    st.markdown("<h3 style='font-size: 18px;'>Models:</h3>", unsafe_allow_html=True)
+    st.markdown("<h3 style='font-size: 18px;'>1. CatBoostClassifier trained on TF-IDF features</h3>", unsafe_allow_html=True)
+    st.markdown("<h3 style='font-size: 18px;'>2. LSTM with Bahdanau attention</h3>", unsafe_allow_html=True)
+    st.markdown("<h3 style='font-size: 18px;'>3. Rubert-tiny2</h3>", unsafe_allow_html=True)
+    st.markdown("<h3 style='font-size: 18px;'>4. Rubert-tiny-toxicity</h3>", unsafe_allow_html=True)
+    st.dataframe(df)
+    st.image('Data/20182704132259.jpg', use_column_width=True)
+
+def model_selection_page():
+    st.sidebar.title("Model Selection")
+    selected_model = st.sidebar.radio("Select a model", ("Classic ML", "LSTM", "BERT"))
+
+    if selected_model == "Classic ML":
+        classic_ml_page()
+        st.write("You selected Classic ML.")
+    elif selected_model == "LSTM":
+        lstm_model_page()
+        st.write("You selected LSTM.")
+    elif selected_model == "BERT":
+        bert_model_page()
+        st.write("You selected BERT.")
+
+def main():
+    page = st.sidebar.radio("Go to", ("App Description", "Model Selection", "Toxicity Model"))
+
+    if page == "App Description":
+        app_description_page()
+    elif page == "Model Selection":
+        model_selection_page()
+    elif page == "Toxicity Model":
+        toxicity_page()
+
+if __name__ == "__main__":
+    main()
requirements.txt ADDED
@@ -0,0 +1,24 @@
+cachetools
+catboost
+charset-normalizer
+cycler
+gensim
+GitPython
+graphviz
+huggingface-hub
+joblib
+markdown-it-py
+networkx
+nltk
+numpy
+pandas
+pillow
+pymystem3
+requests
+scikit-learn
+scipy==1.10.1
+streamlit
+sympy
+torch
+tqdm
+transformers