osheina committed
Commit f987f4c · verified · 1 Parent(s): 053f659

Upload 16 files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ cat_model4.cbm filter=lfs diff=lfs merge=lfs -text
+ healthcare_facilities_reviews.csv filter=lfs diff=lfs merge=lfs -text
+ healthcare_facilities_reviews.jsonl filter=lfs diff=lfs merge=lfs -text
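
The new patterns route the large binaries in this commit through Git LFS, so the `.cbm`, `.csv`, and `.jsonl` files (and the other binary files below) appear in the diff only as `version`/`oid`/`size` pointer stubs. A minimal sketch of fetching one of the real files with `huggingface_hub` (pinned in requirements.txt); the `repo_id` is a placeholder, since the repository name is not shown on this page:

```python
from huggingface_hub import hf_hub_download

# Placeholder repo_id, for illustration only.
local_path = hf_hub_download(repo_id="osheina/reviews-app", filename="cat_model4.cbm")
print(local_path)  # local cache path of the resolved binary, not the LFS pointer
```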
BERTmodel_weights2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a449261c46bb588503a65b6cb825f996ae1f2e4af24ceacc6b7a94ef9542bdbf
+ size 116986906
bert_file.py ADDED
@@ -0,0 +1,21 @@
+ from transformers import AutoModel
+ from torch import nn
+
+ class BERTClassifier(nn.Module):
+     def __init__(self, bert_path="cointegrated/rubert-tiny2"):
+         super().__init__()
+         self.bert = AutoModel.from_pretrained(bert_path)
+         for param in self.bert.parameters():
+             param.requires_grad = False  # freeze the BERT backbone; only the head below is trained
+         self.linear = nn.Sequential(
+             nn.Linear(312, 150),  # 312 is the rubert-tiny2 hidden size
+             nn.Dropout(0.1),
+             nn.ReLU(),
+             nn.Linear(150, 1),
+             nn.Sigmoid()
+         )
+
+     def forward(self, x, masks):
+         bert_out = self.bert(x, attention_mask=masks)[0][:, 0, :]  # [CLS] token embedding
+         out = self.linear(bert_out)
+         return out
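
A minimal inference sketch for this classifier head (not part of the commit): it assumes the uploaded `BERTmodel_weights2.pth` sits in the working directory, and the example review is made up. `bert_strim.py` below does the same thing inside a Streamlit page.

```python
import torch
from transformers import AutoTokenizer
from bert_file import BERTClassifier

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = BERTClassifier()
model.load_state_dict(torch.load("BERTmodel_weights2.pth", map_location="cpu"))  # assumed relative path
model.eval()

enc = tokenizer("Отличная клиника, спасибо!", return_tensors="pt")
with torch.no_grad():
    prob = model(enc["input_ids"], enc["attention_mask"]).item()  # sigmoid output in [0, 1]
print(prob)  # > 0.5 would be read as a positive review
```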
bert_strim.py ADDED
@@ -0,0 +1,45 @@
+ import streamlit as st
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ from bert_file import BERTClassifier
+ import numpy as np
+
+ tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
+ model = BERTClassifier()
+ device = 'cpu'
+
+ model.load_state_dict(torch.load('/Users/olgaseina/Desktop/NLP_project/BERTmodel_weights2.pth', map_location=torch.device('cpu')))
+ model.eval()
+
+ @st.cache_data
+ def predict_sentiment(text):
+     MAX_LEN = 100
+     encoded_review = tokenizer.encode_plus(
+         text,
+         max_length=MAX_LEN,
+         add_special_tokens=True,
+         return_token_type_ids=False,
+         padding='max_length',  # replaces the deprecated pad_to_max_length=True flag
+         return_attention_mask=True,
+         return_tensors='pt',
+     )
+     input_ids = encoded_review['input_ids'].to(device)
+     attention_mask = encoded_review['attention_mask'].to(device)
+
+     with torch.no_grad():
+         output = model(input_ids, attention_mask)
+     prediction = torch.round(output).cpu().numpy()[0][0]
+     if prediction == 1:
+         return "Позитивный отзыв 😀"
+     else:
+         return "Негативный отзыв 😟"
+
+ def bert_model_page():
+     st.title("Классификатор отзывов")
+     user_input = st.text_area("Введите отзыв:")
+     if st.button("Классифицировать"):
+         if user_input:
+             prediction = predict_sentiment(user_input)
+             st.write(prediction)
+         else:
+             st.write("Пожалуйста, введите отзыв для классификации.")
cat_model4.cbm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:27995b1be7aee32a51075d40d154e00d7590e9fec2f2408635cd57d563ac0513
+ size 1135408
common_file.py ADDED
@@ -0,0 +1,54 @@
+ import streamlit as st
+ import torch
+ import requests
+ from PIL import Image
+ from io import BytesIO
+ import time
+ import numpy as np
+ import os
+ from toxic1 import toxicity_page
+ from strim_nlp import classic_ml_page
+ from lstm import lstm_model_page
+ from bert_strim import bert_model_page
+
+ # First page: a general description of the app
+ def app_description_page():
+     st.title("Welcome to My App!")
+     st.write("This is a Streamlit application where you can explore several different models.")
+
+ # Second page: all review models and a selector to switch between them
+ def model_selection_page():
+     st.sidebar.title("Model Selection")
+     selected_model = st.sidebar.radio("Select a model", ("Classic ML", "LSTM", "BERT"))
+
+
+     # Depending on the model selected, display different information or use different models
+     if selected_model == "Classic ML":
+         classic_ml_page()  # code for loading and using the first model could be added here
+         st.write("You selected Classic ML.")
+     elif selected_model == "LSTM":
+         lstm_model_page()  # code for loading and using the second model could be added here
+         st.write("You selected LSTM.")
+     elif selected_model == "BERT":
+         bert_model_page()
+         # code for loading and using the third model could be added here
+         st.write("You selected BERT.")
+
+     # Add other components for review prediction here if needed
+
+ # Main entry point
+ def main():
+     # st.sidebar.title("Navigation")  # You can remove or comment out this line since the sidebar title is set in model_selection_page now
+     page = st.sidebar.radio("Go to", ("App Description", "Model Selection", "Toxicity Model"))
+
+     if page == "App Description":
+         app_description_page()
+     elif page == "Model Selection":
+         model_selection_page()
+     elif page == "Toxicity Model":
+         toxicity_page()  # Call the function from toxic1.py
+
+
+ # Run the main function
+ if __name__ == "__main__":
+     main()
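
`common_file.py` is the entry point that wires the sidebar navigation to the individual model pages; it would normally be started with Streamlit's CLI. A purely illustrative programmatic launcher:

```python
# Equivalent to running `streamlit run common_file.py` from a shell.
import subprocess

subprocess.run(["streamlit", "run", "common_file.py"], check=True)
```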
final_model_bah.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4c57591e552f8f7173253da29d0529a8ff5d0875b4fa7017aa111f5e9f87455
+ size 1506113
healthcare_facilities_reviews.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b329837f76ec5275dc35f7228007a2a55ac62b37f88ad54fef222bd317c8efd3
+ size 79002044
healthcare_facilities_reviews.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:74e8cb7f3eb5981b0934f66856123900a0f4c6ca83b1b06704e50deafea2b186
+ size 95300708
lstm.py ADDED
@@ -0,0 +1,39 @@
+ import streamlit as st
+ import torch
+ import re
+ import json
+ from nltk.corpus import stopwords
+ from model_file import data_preprocessing, preprocess_single_string, LSTMBahdanauAttention
+ from nltk.corpus import stopwords
+ stop_words = set(stopwords.words('russian'))
+
+ # Load vocabulary mapping
+ with open('/Users/olgaseina/ds-phase-2/10-nlp/vocab_to_int.json', 'r') as file:
+     vocab_to_int = json.load(file)
+
+ # Load the pre-trained model
+ SEQ_LEN = 96
+ model_bah = LSTMBahdanauAttention()
+ # Load the trained weights (the vocabulary size is fixed by the embedding built in model_file.py)
+ model_bah.load_state_dict(torch.load('/Users/olgaseina/ds-phase-2/10-nlp/final_model_bah.pth'))
+ model_bah.eval()
+
+ # Function to analyze sentiment
+ def analyze_sentiment(text):
+     preprocessed_text = data_preprocessing(text)
+     sample = preprocess_single_string(preprocessed_text, SEQ_LEN, vocab_to_int)
+
+     with torch.no_grad():
+         probability = model_bah(sample.unsqueeze(0))[0].sigmoid().item()
+     return probability
+
+ # Streamlit UI
+ def lstm_model_page():
+     st.title("Классификация отзывов лечебных учреждений")
+     user_input = st.text_area("Введите ваш отзыв:")
+     if st.button("Классифицировать"):
+         probability = analyze_sentiment(user_input)
+         if probability > 0.5:
+             st.write("Отзыв положительный 🌟")
+         else:
+             st.write("Отзыв отрицательный 😞")
model_file.py ADDED
@@ -0,0 +1,176 @@
+ import re
+ import string
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ from nltk.corpus import stopwords
+ stop_words = set(stopwords.words('russian'))
+ from collections import Counter
+ from gensim.models import Word2Vec
+ import pandas as pd
+ import torch.nn.functional as F
+
+
+ HIDDEN_SIZE = 32
+ SEQ_LEN = 32
+ df = pd.read_json('/Users/olgaseina/ds-phase-2/10-nlp/data/tg_channels/healthcare_facilities_reviews.jsonl', lines=True)
+
+ def data_preprocessing(text: str) -> str:
+     text = text.lower()
+     text = re.sub('<.*?>', '', text)  # html tags
+     text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+     text = ' '.join([word for word in text.split() if word not in stop_words])
+     text = [word for word in text.split() if not word.isdigit()]
+     text = ' '.join(text)
+     return text
+
+ contents = df['content'].tolist()
+ preprocessed = [data_preprocessing(content) for content in contents]
+
+ corpus = [word for text in preprocessed for word in text.split()]
+ sorted_words = Counter(corpus).most_common()
+
+ def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
+     return list(filter(lambda x: x[1] > n, sorted_words))
+
+ sorted_words = get_words_by_freq(sorted_words, 100)
+ sorted_words[-10:]  # no-op notebook leftover: inspects the rarest words that were kept
+
+ vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}
+
+ reviews_int = []
+ for text in preprocessed:
+     r = [vocab_to_int[word] for word in text.split() if vocab_to_int.get(word)]
+     reviews_int.append(r)
+
+ w2v_input = []
+ for review in preprocessed:
+     cur_review = []
+     for word in review.split():
+         if vocab_to_int.get(word):
+             cur_review.append(word)
+     w2v_input.append(cur_review)
+
+ VOCAB_SIZE = len(vocab_to_int) + 1
+
+ EMBEDDING_DIM = 64
+
+ wv = Word2Vec(
+     min_count=1,  # minimum corpus frequency for a word to be kept
+     vector_size=EMBEDDING_DIM  # dimensionality of each word vector
+ )
+ wv.build_vocab(w2v_input)
+
+ wv.train(
+     corpus_iterable=w2v_input,
+     total_examples=wv.corpus_count,
+     epochs=10
+ )
+ embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
+
+ # Walk over every word in the vocabulary: if the word is present, copy its vector;
+ # if it is missing, print it and skip it
+ for word, i in vocab_to_int.items():
+     try:
+         embedding_vector = wv.wv[word]
+         embedding_matrix[i] = embedding_vector
+     except KeyError as e:
+         print(f'{e}: word: {word}')
+         pass
+
+ # Build the pretrained embedding layer – it stays frozen during training
+ embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
+
+ def data_preprocessing(text: str) -> str:  # identical re-definition of the function above
+     text = text.lower()
+     text = re.sub('<.*?>', '', text)  # html tags
+     text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+     text = ' '.join([word for word in text.split() if word not in stop_words])
+     text = [word for word in text.split() if not word.isdigit()]
+     text = ' '.join(text)
+     return text
+
+
+
+ def padding(review_int: list, seq_len: int) -> np.array:  # type: ignore
+     features = np.zeros((len(review_int), seq_len), dtype=int)
+     for i, review in enumerate(review_int):
+         if len(review) <= seq_len:
+             zeros = list(np.zeros(seq_len - len(review)))
+             new = zeros + review
+         else:
+             new = review[:seq_len]
+         features[i, :] = np.array(new)
+
+     return features
+
+ def preprocess_single_string(
+     input_string: str,
+     seq_len: int,
+     vocab_to_int: dict,
+     verbose: bool = False
+ ) -> torch.tensor:
+     preprocessed_string = data_preprocessing(input_string)
+     result_list = []
+     for word in preprocessed_string.split():
+         try:
+             result_list.append(vocab_to_int[word])
+         except KeyError as e:
+             if verbose:
+                 print(f'{e}: not in dictionary!')
+             pass
+     result_padded = padding([result_list], seq_len)[0]
+
+     return torch.tensor(result_padded)
+
+ class BahdanauAttention(nn.Module):
+     def __init__(
+         self,
+         hidden_size: int = HIDDEN_SIZE
+     ) -> None:
+
+         super().__init__()
+         self.hidden_size = hidden_size
+         self.W = nn.Linear(hidden_size, hidden_size)
+         self.U = nn.Linear(hidden_size, hidden_size)
+         self.V = nn.Linear(hidden_size, 1)
+         self.tanh = nn.Tanh()
+
+     def forward(
+         self,
+         keys: torch.Tensor,   # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
+         query: torch.Tensor   # BATCH_SIZE x HIDDEN_SIZE
+     ):
+
+         query = query.unsqueeze(1)  # BATCH_SIZE x 1 x HIDDEN_SIZE
+         r_query = self.W(query)     # BATCH_SIZE x 1 x HIDDEN_SIZE
+
+         r_keys = self.U(keys)       # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
+
+         scores = self.V(torch.tanh(r_query + r_keys))  # BATCH_SIZE x SEQ_LEN x 1
+         scores = scores.squeeze(-1)                    # BATCH_SIZE x SEQ_LEN
+         att_weights = F.softmax(scores, dim=1)         # BATCH_SIZE x SEQ_LEN
+         context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze(1)  # BATCH_SIZE x HIDDEN_SIZE
+         return context, att_weights
+
+ class LSTMBahdanauAttention(nn.Module):
+     def __init__(self) -> None:
+         super().__init__()
+
+         # self.embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
+         self.embedding = embedding_layer
+         self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
+         self.attn = BahdanauAttention(HIDDEN_SIZE)
+         self.clf = nn.Sequential(
+             nn.Linear(HIDDEN_SIZE, 128),
+             nn.Dropout(),
+             nn.Tanh(),
+             nn.Linear(128, 1)
+         )
+
+     def forward(self, x):
+         embeddings = self.embedding(x)
+         outputs, (h_n, _) = self.lstm(embeddings)
+         context, att_weights = self.attn(outputs, h_n.squeeze(0))
+         out = self.clf(context)
+         return out, att_weights
requirements.txt ADDED
@@ -0,0 +1,213 @@
+ aiofiles==23.2.1
+ aiogram==3.4.1
+ aiohttp==3.9.3
+ aiosignal==1.3.1
+ alembic==1.13.1
+ altair==5.2.0
+ annotated-types==0.6.0
+ anyio==4.2.0
+ appdirs==1.4.4
+ appnope==0.1.4
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ asttokens==2.4.1
+ async-lru==2.0.4
+ attrs==23.2.0
+ Babel==2.14.0
+ beautifulsoup4==4.12.3
+ bleach==6.1.0
+ blinker==1.7.0
+ branca==0.7.1
+ cachetools==5.3.2
+ catboost==1.2.3
+ category-encoders==2.6.3
+ certifi==2024.2.2
+ cffi==1.16.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpickle==3.0.0
+ cmdstanpy==1.2.1
+ colorlog==6.8.2
+ comm==0.2.2
+ contourpy==1.2.0
+ cycler==0.12.1
+ dataparser==0.0.2
+ debugpy==1.8.1
+ decorator==5.1.1
+ defusedxml==0.7.1
+ distlib==0.3.8
+ et-xmlfile==1.1.0
+ executing==2.0.1
+ fastjsonschema==2.19.1
+ filelock==3.13.1
+ folium==0.15.1
+ fonttools==4.48.1
+ fqdn==1.5.1
+ frozendict==2.4.0
+ frozenlist==1.4.1
+ fsspec==2024.3.1
+ gensim==4.3.2
+ gitdb==4.0.11
+ GitPython==3.1.42
+ graphviz==0.20.1
+ greenlet==3.0.3
+ h11==0.14.0
+ holidays==0.45
+ html5lib==1.1
+ httpcore==1.0.2
+ httpx==0.26.0
+ huggingface-hub==0.22.2
+ idna==3.6
+ imageio==2.34.0
+ importlib-metadata==7.0.1
+ importlib_resources==6.4.0
+ ipykernel==6.29.4
+ ipython==8.23.0
+ isoduration==20.11.0
+ jedi==0.19.1
+ Jinja2==3.1.3
+ joblib==1.3.2
+ json5==0.9.14
+ jsonpointer==2.4
+ jsonschema==4.21.1
+ jsonschema-specifications==2023.12.1
+ jupyter-events==0.9.0
+ jupyter-lsp==2.2.2
+ jupyter_client==8.6.1
+ jupyter_core==5.7.2
+ jupyter_server==2.12.5
+ jupyter_server_terminals==0.5.2
+ jupyterlab==4.1.0
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.25.2
+ kiwisolver==1.4.5
+ lazy_loader==0.3
+ lightgbm==4.3.0
+ llvmlite==0.42.0
+ lxml==5.1.0
+ magic-filter==1.0.12
+ Mako==1.3.2
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.8.3
+ matplotlib-inline==0.1.6
+ mdurl==0.1.2
+ mistune==3.0.2
+ mpmath==1.3.0
+ multidict==6.0.5
+ multitasking==0.0.11
+ nbclient==0.9.0
+ nbconvert==7.15.0
+ nbformat==5.9.2
+ nest-asyncio==1.6.0
+ networkx==3.2.1
+ nltk==3.8.1
+ notebook_shim==0.2.3
+ numba==0.59.0
+ numpy==1.26.4
+ opencv-python==4.9.0.80
+ openpyxl==3.1.2
+ optuna==3.5.0
+ overrides==7.7.0
+ packaging==24.0
+ pandas==2.2.0
+ pandocfilters==1.5.1
+ parso==0.8.4
+ patsy==0.5.6
+ peewee==3.17.1
+ pexpect==4.9.0
+ pillow==10.2.0
+ platformdirs==4.2.0
+ plotly==5.19.0
+ prometheus-client==0.19.0
+ prompt-toolkit==3.0.43
+ prophet==1.1.5
+ protobuf==4.25.3
+ psutil==5.9.8
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ py-cpuinfo==9.0.0
+ pyarrow==15.0.0
+ pycparser==2.21
+ pydantic==2.5.3
+ pydantic_core==2.14.6
+ pydeck==0.8.1b0
+ pyenchant==3.2.2
+ Pygments==2.17.2
+ pymystem3==0.2.0
+ pynndescent==0.5.11
+ pyparsing==3.1.1
+ python-dateutil==2.9.0.post0
+ python-json-logger==2.0.7
+ pytz==2024.1
+ PyYAML==6.0.1
+ pyzmq==25.1.2
+ referencing==0.33.0
+ regex==2023.12.25
+ requests==2.31.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rich==13.7.0
+ rpds-py==0.17.1
+ safetensors==0.4.2
+ scikit-image==0.22.0
+ scikit-learn==1.4.0
+ scipy==1.12.0
+ seaborn==0.13.2
+ Send2Trash==1.8.2
+ setuptools==69.2.0
+ six==1.16.0
+ smart-open==7.0.4
+ smmap==5.0.1
+ sniffio==1.3.0
+ soupsieve==2.5
+ SQLAlchemy==2.0.28
+ stack-data==0.6.3
+ stanio==0.3.0
+ statsmodels==0.14.1
+ streamlit==1.31.1
+ stumpy==1.12.0
+ sympy==1.12
+ tenacity==8.2.3
+ terminado==0.18.0
+ thop==0.1.1.post2209072238
+ threadpoolctl==3.2.0
+ tifffile==2024.2.12
+ tinycss2==1.2.1
+ tokenizers==0.15.2
+ toml==0.10.2
+ toolz==0.12.1
+ torch==2.2.2
+ torchaudio==2.2.2
+ torchvision==0.17.2
+ tornado==6.4
+ tqdm==4.66.2
+ traitlets==5.14.2
+ transformers==4.39.3
+ translit==0.2a1
+ transliterate==1.10.2
+ tsfresh==0.20.2
+ types-python-dateutil==2.8.19.20240106
+ typing_extensions==4.9.0
+ tzdata==2024.1
+ tzlocal==5.2
+ ultralytics==8.1.42
+ umap==0.1.1
+ umap-learn==0.5.5
+ uri-template==1.3.0
+ urllib3==2.2.0
+ validators==0.22.0
+ virtualenv==20.25.1
+ wcwidth==0.2.13
+ webcolors==1.13
+ webencodings==0.5.1
+ websocket-client==1.7.0
+ wrapt==1.16.0
+ xgboost==2.0.3
+ xlrd==2.0.1
+ xyzservices==2023.10.1
+ yarl==1.9.4
+ yellowbrick==1.5
+ yfinance==0.2.36
+ zipp==3.17.0
rnn_preprocessing.py ADDED
@@ -0,0 +1,80 @@
+ import re
+ import string
+ import numpy as np
+ import torch
+
+ from nltk.corpus import stopwords
+ stop_words = set(stopwords.words('russian'))
+
+ def data_preprocessing(text: str) -> str:
+     """preprocessing string: lowercase, removing html-tags, punctuation,
+     stopwords, digits
+
+     Args:
+         text (str): input string for preprocessing
+
+     Returns:
+         str: preprocessed string
+     """
+
+     text = text.lower()
+     text = re.sub('<.*?>', '', text)  # html tags
+     text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+     text = ' '.join([word for word in text.split() if word not in stop_words])
+     text = [word for word in text.split() if not word.isdigit()]
+     text = ' '.join(text)
+     return text
+
+ def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
+     return list(filter(lambda x: x[1] > n, sorted_words))
+
+ def padding(review_int: list, seq_len: int) -> np.array:  # type: ignore
+     """Make left-sided padding for input list of tokens
+
+     Args:
+         review_int (list): input list of tokens
+         seq_len (int): max length of sequence; if len(review_int[i]) > seq_len it will be trimmed, else it will be padded by zeros
+
+     Returns:
+         np.array: padded sequences
+     """
+     features = np.zeros((len(review_int), seq_len), dtype=int)
+     for i, review in enumerate(review_int):
+         if len(review) <= seq_len:
+             zeros = list(np.zeros(seq_len - len(review)))
+             new = zeros + review
+         else:
+             new = review[:seq_len]
+         features[i, :] = np.array(new)
+
+     return features
+
+ def preprocess_single_string(
+     input_string: str,
+     seq_len: int,
+     vocab_to_int: dict,
+     verbose: bool = False
+ ) -> torch.tensor:
+     """Function for all preprocessing steps on a single string
+
+     Args:
+         input_string (str): input single string for preprocessing
+         seq_len (int): max length of sequence; if len(review_int[i]) > seq_len it will be trimmed, else it will be padded by zeros
+         vocab_to_int (dict, optional): word corpus {'word' : int index}. Defaults to vocab_to_int.
+
+     Returns:
+         torch.tensor: padded sequence of token indices
+     """
+
+     preprocessed_string = data_preprocessing(input_string)
+     result_list = []
+     for word in preprocessed_string.split():
+         try:
+             result_list.append(vocab_to_int[word])
+         except KeyError as e:
+             if verbose:
+                 print(f'{e}: not in dictionary!')
+             pass
+     result_padded = padding([result_list], seq_len)[0]
+
+     return torch.tensor(result_padded)
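
A small usage sketch of these helpers (not part of the commit): the toy vocabulary is made up, and the NLTK Russian stopwords must be downloaded before the module is imported.

```python
import nltk
nltk.download("stopwords")  # required because the module builds stop_words at import time

from rnn_preprocessing import data_preprocessing, padding, preprocess_single_string

toy_vocab = {"хороший": 1, "врач": 2}  # hypothetical vocabulary, for illustration only

print(data_preprocessing("Очень хороший врач! 10 из 10"))      # stopwords, digits and punctuation removed
print(padding([[1, 2]], seq_len=5))                             # -> [[0 0 0 1 2]]
print(preprocess_single_string("хороший врач", 5, toy_vocab))   # -> tensor([0, 0, 0, 1, 2])
```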
strim_nlp.py ADDED
@@ -0,0 +1,58 @@
+ import streamlit as st
+ import pandas as pd
+ from catboost import CatBoostClassifier
+ import re
+ import string
+ from nltk.corpus import stopwords
+ from pymystem3 import Mystem
+ from joblib import load
+ import nltk
+
+ def data_preprocessing(text):
+     stop_words = set(stopwords.words('russian'))
+     text = text.lower()
+     text = re.sub("<.*?>", "", text)
+     text = re.sub(r'http\S+', " ", text)
+     text = re.sub(r'@\w+', ' ', text)
+     text = re.sub(r'#\w+', ' ', text)
+     text = re.sub(r'\d+', ' ', text)
+     text = "".join([c for c in text if c not in string.punctuation])
+     return " ".join([word for word in text.split() if word not in stop_words])
+
+ def lemmatize_text(text):
+     mystem = Mystem()
+     lemmas = mystem.lemmatize(text)
+     return ' '.join(lemmas)
+
+ model = CatBoostClassifier()
+ model.load_model('/Users/olgaseina/Desktop/NLP_project/cat_model4.cbm')
+
+ tfidf_vectorizer = load('/Users/olgaseina/Desktop/NLP_project/tfidf_vectorizer.joblib')
+
+ def classic_ml_page():
+     st.title("Классификация отзывов о медицинских учреждениях")
+     user_review = st.text_area("Введите ваш отзыв здесь:")
+
+     if st.button("Классифицировать"):
+         if user_review:
+             preprocessed_review = data_preprocessing(user_review)
+             lemmatized_review = lemmatize_text(preprocessed_review)
+             vectorized_review = tfidf_vectorizer.transform([lemmatized_review])
+             prediction = model.predict(vectorized_review)
+
+             if prediction[0] == 1:
+                 st.write("Позитивный отзыв 😀")
+             else:
+                 st.write("Негативный отзыв 😟")
+         else:
+             st.write("Пожалуйста, введите отзыв для классификации.")
+
+
+
+
+
+
+
+
+
+
+
tfidf_vectorizer.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:621e7e86acf6a032018e0e5ebf0876579f4f846478a70e782eb3c476298c088f
+ size 1750676
toxic1.py ADDED
@@ -0,0 +1,42 @@
+ # toxic1.py
+ import streamlit as st
+ import numpy as np
+ import pandas as pd
+ import time
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ # Ensure your model and tokenizer paths are correct and accessible by the Streamlit app.
+ # Since you're importing this into another file, relative or absolute paths might need to be updated accordingly.
+ model_t_checkpoint = 'cointegrated/rubert-tiny-toxicity'
+ tokenizer_t = AutoTokenizer.from_pretrained(model_t_checkpoint)
+ model_t = AutoModelForSequenceClassification.from_pretrained(model_t_checkpoint)
+
+ def text2toxicity(text, aggregate=True):
+     with torch.no_grad():
+         inputs = tokenizer_t(text, return_tensors='pt', truncation=True, padding=True).to('cpu')
+         proba = torch.sigmoid(model_t(**inputs).logits).cpu().numpy()
+     if isinstance(text, str):
+         proba = proba[0]
+     if aggregate:
+         return 1 - proba.T[0] * (1 - proba.T[-1])
+     return proba
+
+ def toxicity_page():
+     st.title("""
+     Определим токсичный комментарий или нет
+     """)
+
+     user_text_input = st.text_area('Введите ваш отзыв здесь:')
+
+     if st.button('Предсказать'):
+         start_time = time.time()
+         proba = text2toxicity(user_text_input, True)
+         end_time = time.time()
+         prediction_time = end_time - start_time
+
+         if proba >= 0.5:
+             st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий токсичный.')
+         else:
+             st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий не токсичный.')
+         st.write(f'Время предсказания: {prediction_time:.4f} секунд')
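
In `text2toxicity`, the aggregated score `1 - proba.T[0] * (1 - proba.T[-1])` stays low only when the model's first sigmoid output is high and its last output is low; otherwise the comment is flagged as toxic. A small usage sketch (the example strings are made up; the first call downloads `cointegrated/rubert-tiny-toxicity` from the Hub):

```python
from toxic1 import text2toxicity

print(text2toxicity("Спасибо, врач очень помог!"))       # expected: score close to 0
print(text2toxicity("Ты ужасный и бесполезный врач!"))   # expected: noticeably higher score
```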
vocab_to_int.json ADDED
The diff for this file is too large to render. See raw diff