Spaces:
Sleeping
Sleeping
Upload 16 files
Browse files- .gitattributes +3 -0
- BERTmodel_weights2.pth +3 -0
- bert_file.py +21 -0
- bert_strim.py +45 -0
- cat_model4.cbm +3 -0
- common_file.py +54 -0
- final_model_bah.pth +3 -0
- healthcare_facilities_reviews.csv +3 -0
- healthcare_facilities_reviews.jsonl +3 -0
- lstm.py +39 -0
- model_file.py +176 -0
- requirements.txt +213 -0
- rnn_preprocessing.py +80 -0
- strim_nlp.py +58 -0
- tfidf_vectorizer.joblib +3 -0
- toxic1.py +42 -0
- vocab_to_int.json +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
cat_model4.cbm filter=lfs diff=lfs merge=lfs -text
|
37 |
+
healthcare_facilities_reviews.csv filter=lfs diff=lfs merge=lfs -text
|
38 |
+
healthcare_facilities_reviews.jsonl filter=lfs diff=lfs merge=lfs -text
|
BERTmodel_weights2.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a449261c46bb588503a65b6cb825f996ae1f2e4af24ceacc6b7a94ef9542bdbf
|
3 |
+
size 116986906
|
bert_file.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoModel
|
2 |
+
from torch import nn
|
3 |
+
|
4 |
+
class BERTClassifier(nn.Module):
    """Binary classifier: a frozen pretrained encoder followed by a small MLP head.

    The head consumes the encoder's hidden state at sequence position 0 and
    produces one sigmoid-activated value per input sequence.
    """

    def __init__(self, bert_path="cointegrated/rubert-tiny2"):
        """Load the encoder, freeze its weights and build the classification head.

        Args:
            bert_path: Model id or local path passed to ``AutoModel.from_pretrained``.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(bert_path)
        # The encoder is used as a fixed feature extractor: only the head has
        # trainable parameters.
        for param in self.bert.parameters():
            param.requires_grad = False
        # Take the encoder width from its config rather than hard-coding it, so a
        # different `bert_path` does not silently break the first Linear layer.
        # (The original hard-coded 312 for the default checkpoint.)
        hidden_size = self.bert.config.hidden_size
        # Layer order is kept as-is: saved state dicts address the Linear layers
        # by their position in this Sequential.
        self.linear = nn.Sequential(
            nn.Linear(hidden_size, 150),
            nn.Dropout(0.1),
            nn.ReLU(),
            nn.Linear(150, 1),
            nn.Sigmoid(),
        )

    def forward(self, x, masks):
        """Run the encoder and the head.

        Args:
            x: Token ids, forwarded to the encoder as its first positional argument.
            masks: Attention mask, forwarded as ``attention_mask``.

        Returns:
            Output of the head (sigmoid-activated), one row per input sequence.
        """
        # First encoder output, sliced at sequence position 0 for every batch item.
        bert_out = self.bert(x, attention_mask=masks)[0][:, 0, :]
        out = self.linear(bert_out)
        return out
|
bert_strim.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from transformers import AutoTokenizer, AutoModel
|
3 |
+
import torch
|
4 |
+
from bert_file import BERTClassifier
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
from pathlib import Path

# The checkpoint is shipped in the same directory as this module. Resolving it
# relative to __file__ removes the dependency on a developer-specific absolute
# path and on the current working directory.
_WEIGHTS_PATH = Path(__file__).resolve().parent / "BERTmodel_weights2.pth"

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = BERTClassifier()
device = 'cpu'

# map_location keeps deserialization on the CPU regardless of the device the
# checkpoint was saved from.
model.load_state_dict(torch.load(_WEIGHTS_PATH, map_location=torch.device(device)))
# Inference only: switch dropout to evaluation behaviour.
model.eval()
|
13 |
+
|
14 |
+
@st.cache_data
def predict_sentiment(text):
    """Classify a review with the module-level BERT classifier.

    Args:
        text: Raw review text.

    Returns:
        A user-facing label string: the positive label when the rounded model
        output equals 1, the negative label otherwise.
    """
    MAX_LEN = 100
    # `padding='max_length'` and `truncation=True` are the explicit, supported
    # spellings of what the deprecated `pad_to_max_length=True` plus a bare
    # `max_length` did implicitly (with warnings): every input is padded or cut
    # to exactly MAX_LEN tokens.
    encoded_review = tokenizer.encode_plus(
        text,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # No gradients are needed for inference.
    with torch.no_grad():
        output = model(input_ids, attention_mask)
    # The model ends in a sigmoid; rounding turns the single output into 0 or 1.
    prediction = torch.round(output).cpu().numpy()[0][0]
    if prediction == 1:
        return "Позитивный отзыв 😀"
    else:
        return "Негативный отзыв 😟"
|
36 |
+
|
37 |
+
def bert_model_page():
    """Render the BERT page: a text area, a button and the predicted label."""
    st.title("Классификатор отзывов")
    user_input = st.text_area("Введите отзыв:")

    # Nothing to do until the button is pressed.
    if not st.button("Классифицировать"):
        return

    # Ask for input instead of classifying an empty string.
    if not user_input:
        st.write("Пожалуйста, введите отзыв для классификации.")
        return

    st.write(predict_sentiment(user_input))
|
cat_model4.cbm
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:27995b1be7aee32a51075d40d154e00d7590e9fec2f2408635cd57d563ac0513
|
3 |
+
size 1135408
|
common_file.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import torch
|
3 |
+
import requests
|
4 |
+
from PIL import Image
|
5 |
+
from io import BytesIO
|
6 |
+
import time
|
7 |
+
import numpy as np
|
8 |
+
import os
|
9 |
+
from toxic1 import toxicity_page
|
10 |
+
from strim_nlp import classic_ml_page
|
11 |
+
from lstm import lstm_model_page
|
12 |
+
from bert_strim import bert_model_page
|
13 |
+
|
14 |
+
# First page: general description of the application
def app_description_page():
    """Render the landing page: a title followed by a one-line description."""
    heading = "Welcome to My App!"
    description = "This is a Streamlit application where you can explore two different models."
    st.title(heading)
    st.write(description)
|
18 |
+
|
19 |
+
# Second page: pick one of the sentiment models and render its page
def model_selection_page():
    """Show a sidebar model selector and render the page of the selected model.

    After the selected page has been rendered, a line naming the selection is
    written below it.
    """
    st.sidebar.title("Model Selection")

    # Radio label -> page renderer; insertion order is the order of the options.
    pages = {
        "Classic ML": classic_ml_page,
        "LSTM": lstm_model_page,
        "BERT": bert_model_page,
    }
    selected_model = st.sidebar.radio("Select a model", tuple(pages))

    render_page = pages.get(selected_model)
    if render_page is not None:
        render_page()
        st.write(f"You selected {selected_model}.")

    # Add other components for review prediction here if needed
|
38 |
+
|
39 |
+
# Entry point: sidebar navigation between the top-level pages
def main():
    """Show the sidebar navigation and render the page the user selected."""
    # Radio label -> page renderer; insertion order is the order of the options.
    pages = {
        "App Description": app_description_page,
        "Model Selection": model_selection_page,
        "Toxicity Model": toxicity_page,
    }
    page = st.sidebar.radio("Go to", tuple(pages))

    handler = pages.get(page)
    if handler is not None:
        handler()


# Run the app when this file is executed as a script
if __name__ == "__main__":
    main()
|
final_model_bah.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b4c57591e552f8f7173253da29d0529a8ff5d0875b4fa7017aa111f5e9f87455
|
3 |
+
size 1506113
|
healthcare_facilities_reviews.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b329837f76ec5275dc35f7228007a2a55ac62b37f88ad54fef222bd317c8efd3
|
3 |
+
size 79002044
|
healthcare_facilities_reviews.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:74e8cb7f3eb5981b0934f66856123900a0f4c6ca83b1b06704e50deafea2b186
|
3 |
+
size 95300708
|
lstm.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import torch
|
3 |
+
import re
|
4 |
+
import json
|
5 |
+
from nltk.corpus import stopwords
|
6 |
+
from model_file import data_preprocessing, preprocess_single_string, LSTMBahdanauAttention
|
7 |
+
from nltk.corpus import stopwords
|
8 |
+
from pathlib import Path

stop_words = set(stopwords.words('russian'))

# The vocabulary and the checkpoint are shipped next to this module. Resolving
# them relative to __file__ removes the dependency on a developer-specific
# absolute path and on the current working directory.
_MODULE_DIR = Path(__file__).resolve().parent

# Load vocabulary mapping. The encoding is stated explicitly so that decoding
# does not depend on the platform's default locale.
with open(_MODULE_DIR / 'vocab_to_int.json', 'r', encoding='utf-8') as file:
    vocab_to_int = json.load(file)

# Load the pre-trained model
SEQ_LEN = 96
model_bah = LSTMBahdanauAttention()
# map_location keeps deserialization on the CPU regardless of the device the
# checkpoint was saved from.
model_bah.load_state_dict(
    torch.load(_MODULE_DIR / 'final_model_bah.pth', map_location=torch.device('cpu'))
)
# Inference only: switch dropout to evaluation behaviour.
model_bah.eval()
|
20 |
+
|
21 |
+
# Function to analyze sentiment
|
22 |
+
def analyze_sentiment(text):
    """Score a review with the LSTM + attention model.

    Args:
        text: Raw review text.

    Returns:
        float: Sigmoid of the model's output for this text.
    """
    # preprocess_single_string() runs data_preprocessing() on its input itself,
    # so the raw text is passed directly; cleaning it here as well only repeated
    # the same work on already-cleaned text.
    sample = preprocess_single_string(text, SEQ_LEN, vocab_to_int)

    with torch.no_grad():
        # The model returns (output, attention_weights); only the output is scored.
        logits, _ = model_bah(sample.unsqueeze(0))
        probability = logits.sigmoid().item()
    return probability
|
29 |
+
|
30 |
+
# Streamlit UI
|
31 |
+
def lstm_model_page():
    """Render the LSTM page: a text area, a button and the predicted label."""
    st.title("Классификация отзывов лечебных учреждений")
    user_input = st.text_area("Введите ваш отзыв:")
    if st.button("Классифицировать"):
        if user_input:
            probability = analyze_sentiment(user_input)
            # Scores above 0.5 are reported as positive.
            if probability > 0.5:
                st.write("Отзыв положительный 🌟")
            else:
                st.write("Отзыв отрицательный 😞")
        else:
            # Same prompt the other model pages show: an empty string would
            # otherwise be scored as an all-padding sequence.
            st.write("Пожалуйста, введите отзыв для классификации.")
|
model_file.py
ADDED
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import string
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
from nltk.corpus import stopwords
|
7 |
+
stop_words = set(stopwords.words('russian'))
|
8 |
+
from collections import Counter
|
9 |
+
from gensim.models import Word2Vec
|
10 |
+
import pandas as pd
|
11 |
+
import torch.nn.functional as F
|
12 |
+
|
13 |
+
|
14 |
+
from pathlib import Path

HIDDEN_SIZE = 32
SEQ_LEN = 32
# The review dataset is shipped next to this module. Resolving it relative to
# __file__ removes the dependency on a developer-specific absolute path and on
# the current working directory.
df = pd.read_json(
    Path(__file__).resolve().parent / 'healthcare_facilities_reviews.jsonl',
    lines=True,
)
|
17 |
+
|
18 |
+
def data_preprocessing(text: str) -> str:
    """Normalize a raw review string.

    Lowercases the text, strips HTML tags and ASCII punctuation, then drops
    stop words and digit-only tokens.

    Args:
        text: Input string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    without_tags = re.sub('<.*?>', '', lowered)  # html tags
    # Delete every character listed in string.punctuation in one pass.
    without_punct = without_tags.translate(str.maketrans('', '', string.punctuation))
    kept = [
        token
        for token in without_punct.split()
        if token not in stop_words and not token.isdigit()
    ]
    return ' '.join(kept)
|
26 |
+
|
27 |
+
# Raw review texts -> cleaned texts -> flat token list -> (word, count) pairs
# ordered from most to least frequent.
contents = df['content'].tolist()
preprocessed = list(map(data_preprocessing, contents))

corpus = []
for cleaned_text in preprocessed:
    corpus.extend(cleaned_text.split())
sorted_words = Counter(corpus).most_common()
|
32 |
+
|
33 |
+
def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
    """Keep the (word, count) pairs whose count is strictly greater than *n*.

    Args:
        sorted_words: (word, count) pairs; their order is preserved.
        n: Exclusive lower bound on the count.

    Returns:
        A new list with the pairs that pass the threshold.
    """
    return [pair for pair in sorted_words if pair[1] > n]
|
35 |
+
|
36 |
+
# Keep only the words that occur more than 100 times in the corpus.
sorted_words = get_words_by_freq(sorted_words, 100)

# Word -> integer id. Ids start at 1, which leaves 0 free; padding() fills
# sequences with zeros.
vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}

# Every review as a list of vocabulary ids (out-of-vocabulary words dropped).
reviews_int = [
    [vocab_to_int[word] for word in text.split() if word in vocab_to_int]
    for text in preprocessed
]

# Every review as a list of in-vocabulary words: the Word2Vec training corpus.
w2v_input = [
    [word for word in review.split() if word in vocab_to_int]
    for review in preprocessed
]

# +1 for the unused id 0.
VOCAB_SIZE = len(vocab_to_int) + 1

EMBEDDING_DIM = 64

wv = Word2Vec(
    min_count=1,  # minimum number of occurrences in the corpus
    vector_size=EMBEDDING_DIM  # dimensionality of a word vector
)
wv.build_vocab(w2v_input)

wv.train(
    corpus_iterable=w2v_input,
    total_examples=wv.corpus_count,
    epochs=10
)
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

# Walk over the vocabulary: copy the vector of every word Word2Vec knows;
# report and skip the words it does not know (their rows stay zero).
for word, i in vocab_to_int.items():
    try:
        embedding_matrix[i] = wv.wv[word]
    except KeyError as e:
        print(f'{e}: word: {word}')

# Pretrained embedding built from the matrix above; per the original note this
# layer is not trained as part of the network.
embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
|
83 |
+
|
84 |
+
# NOTE(review): this is a second definition of data_preprocessing in this module;
# it rebinds the name defined earlier in the file with the same logic.
def data_preprocessing(text: str) -> str:
    """Normalize a raw review string.

    Lowercases the text, strips HTML tags and ASCII punctuation, then drops
    stop words and digit-only tokens.

    Args:
        text: Input string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = re.sub('<.*?>', '', text.lower())  # html tags
    punctuation = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in punctuation)

    tokens = []
    for token in text.split():
        if token in stop_words or token.isdigit():
            continue
        tokens.append(token)
    return ' '.join(tokens)
|
92 |
+
|
93 |
+
|
94 |
+
|
95 |
+
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad (or trim) token-id sequences to a fixed length.

    Args:
        review_int: List of token-id lists.
        seq_len: Target length. Longer sequences keep their first ``seq_len``
            ids; shorter ones are padded with zeros on the left.

    Returns:
        Integer array of shape ``(len(review_int), seq_len)``.
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        trimmed = review[:seq_len]
        # Write the ids into the right-hand end of the zero-initialised row;
        # whatever is left untouched on the left is the padding.
        if len(trimmed):
            features[i, seq_len - len(trimmed):] = trimmed

    return features
|
106 |
+
|
107 |
+
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose: bool = False
) -> torch.Tensor:
    """Turn one raw string into a fixed-length tensor of vocabulary ids.

    Args:
        input_string: Raw text; it is cleaned with ``data_preprocessing`` here.
        seq_len: Length of the returned sequence (see ``padding``).
        vocab_to_int: Mapping from word to integer id.
        verbose: If True, print every word that is missing from the mapping.

    Returns:
        1-D tensor of length ``seq_len``.
    """
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            # Out-of-vocabulary words are skipped.
            if verbose:
                print(f'{e}: not in dictionary!')
    # padding() works on a batch; wrap the single sequence and unwrap the result.
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)
|
125 |
+
|
126 |
+
class BahdanauAttention(nn.Module):
    """Additive attention: scores are ``V(tanh(W(query) + U(keys)))``."""

    def __init__(self, hidden_size: int = HIDDEN_SIZE) -> None:
        """Create the three linear projections used to score the keys.

        Args:
            hidden_size: Feature size of both the keys and the query.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.W = nn.Linear(hidden_size, hidden_size)
        self.U = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)
        self.tanh = nn.Tanh()

    def forward(
        self,
        keys: torch.Tensor,  # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
        query: torch.Tensor  # BATCH_SIZE x HIDDEN_SIZE
    ):
        """Compute the attention-weighted sum of the keys.

        Returns:
            Tuple ``(context, att_weights)``: the weighted sum of ``keys`` over
            the sequence axis, and the softmax weights that produced it.
        """
        # BATCH_SIZE x 1 x HIDDEN_SIZE; broadcasts against the projected keys.
        projected_query = self.W(query.unsqueeze(1))
        # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
        projected_keys = self.U(keys)

        energy = torch.tanh(projected_query + projected_keys)
        # BATCH_SIZE x SEQ_LEN x 1 -> BATCH_SIZE x SEQ_LEN
        scores = self.V(energy).squeeze(-1)
        # Normalise over the sequence axis.
        att_weights = F.softmax(scores, dim=1)
        # (B x 1 x SEQ_LEN) @ (B x SEQ_LEN x HIDDEN) -> B x HIDDEN
        context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze(1)
        return context, att_weights
|
155 |
+
|
156 |
+
class LSTMBahdanauAttention(nn.Module):
    """Embedding -> LSTM -> Bahdanau attention -> MLP classifier."""

    def __init__(self) -> None:
        """Build the network from the module-level embedding layer and size constants."""
        super().__init__()

        # The module-level pretrained embedding layer is used directly.
        self.embedding = embedding_layer
        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
        self.attn = BahdanauAttention(HIDDEN_SIZE)
        classifier_layers = [
            nn.Linear(HIDDEN_SIZE, 128),
            nn.Dropout(),
            nn.Tanh(),
            nn.Linear(128, 1),
        ]
        self.clf = nn.Sequential(*classifier_layers)

    def forward(self, x):
        """Classify a batch of token-id sequences.

        Returns:
            Tuple ``(out, att_weights)``: the classifier output (no activation is
            applied here) and the attention weights over the sequence.
        """
        embedded = self.embedding(x)
        lstm_out, (last_hidden, _) = self.lstm(embedded)
        # Drop the leading axis of the final hidden state to use it as the query.
        query = last_hidden.squeeze(0)
        context, att_weights = self.attn(lstm_out, query)
        return self.clf(context), att_weights
|
requirements.txt
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==23.2.1
|
2 |
+
aiogram==3.4.1
|
3 |
+
aiohttp==3.9.3
|
4 |
+
aiosignal==1.3.1
|
5 |
+
alembic==1.13.1
|
6 |
+
altair==5.2.0
|
7 |
+
annotated-types==0.6.0
|
8 |
+
anyio==4.2.0
|
9 |
+
appdirs==1.4.4
|
10 |
+
appnope==0.1.4
|
11 |
+
argon2-cffi==23.1.0
|
12 |
+
argon2-cffi-bindings==21.2.0
|
13 |
+
arrow==1.3.0
|
14 |
+
asttokens==2.4.1
|
15 |
+
async-lru==2.0.4
|
16 |
+
attrs==23.2.0
|
17 |
+
Babel==2.14.0
|
18 |
+
beautifulsoup4==4.12.3
|
19 |
+
bleach==6.1.0
|
20 |
+
blinker==1.7.0
|
21 |
+
branca==0.7.1
|
22 |
+
cachetools==5.3.2
|
23 |
+
catboost==1.2.3
|
24 |
+
category-encoders==2.6.3
|
25 |
+
certifi==2024.2.2
|
26 |
+
cffi==1.16.0
|
27 |
+
charset-normalizer==3.3.2
|
28 |
+
click==8.1.7
|
29 |
+
cloudpickle==3.0.0
|
30 |
+
cmdstanpy==1.2.1
|
31 |
+
colorlog==6.8.2
|
32 |
+
comm==0.2.2
|
33 |
+
contourpy==1.2.0
|
34 |
+
cycler==0.12.1
|
35 |
+
dataparser==0.0.2
|
36 |
+
debugpy==1.8.1
|
37 |
+
decorator==5.1.1
|
38 |
+
defusedxml==0.7.1
|
39 |
+
distlib==0.3.8
|
40 |
+
et-xmlfile==1.1.0
|
41 |
+
executing==2.0.1
|
42 |
+
fastjsonschema==2.19.1
|
43 |
+
filelock==3.13.1
|
44 |
+
folium==0.15.1
|
45 |
+
fonttools==4.48.1
|
46 |
+
fqdn==1.5.1
|
47 |
+
frozendict==2.4.0
|
48 |
+
frozenlist==1.4.1
|
49 |
+
fsspec==2024.3.1
|
50 |
+
gensim==4.3.2
|
51 |
+
gitdb==4.0.11
|
52 |
+
GitPython==3.1.42
|
53 |
+
graphviz==0.20.1
|
54 |
+
greenlet==3.0.3
|
55 |
+
h11==0.14.0
|
56 |
+
holidays==0.45
|
57 |
+
html5lib==1.1
|
58 |
+
httpcore==1.0.2
|
59 |
+
httpx==0.26.0
|
60 |
+
huggingface-hub==0.22.2
|
61 |
+
idna==3.6
|
62 |
+
imageio==2.34.0
|
63 |
+
importlib-metadata==7.0.1
|
64 |
+
importlib_resources==6.4.0
|
65 |
+
ipykernel==6.29.4
|
66 |
+
ipython==8.23.0
|
67 |
+
isoduration==20.11.0
|
68 |
+
jedi==0.19.1
|
69 |
+
Jinja2==3.1.3
|
70 |
+
joblib==1.3.2
|
71 |
+
json5==0.9.14
|
72 |
+
jsonpointer==2.4
|
73 |
+
jsonschema==4.21.1
|
74 |
+
jsonschema-specifications==2023.12.1
|
75 |
+
jupyter-events==0.9.0
|
76 |
+
jupyter-lsp==2.2.2
|
77 |
+
jupyter_client==8.6.1
|
78 |
+
jupyter_core==5.7.2
|
79 |
+
jupyter_server==2.12.5
|
80 |
+
jupyter_server_terminals==0.5.2
|
81 |
+
jupyterlab==4.1.0
|
82 |
+
jupyterlab_pygments==0.3.0
|
83 |
+
jupyterlab_server==2.25.2
|
84 |
+
kiwisolver==1.4.5
|
85 |
+
lazy_loader==0.3
|
86 |
+
lightgbm==4.3.0
|
87 |
+
llvmlite==0.42.0
|
88 |
+
lxml==5.1.0
|
89 |
+
magic-filter==1.0.12
|
90 |
+
Mako==1.3.2
|
91 |
+
markdown-it-py==3.0.0
|
92 |
+
MarkupSafe==2.1.5
|
93 |
+
matplotlib==3.8.3
|
94 |
+
matplotlib-inline==0.1.6
|
95 |
+
mdurl==0.1.2
|
96 |
+
mistune==3.0.2
|
97 |
+
mpmath==1.3.0
|
98 |
+
multidict==6.0.5
|
99 |
+
multitasking==0.0.11
|
100 |
+
nbclient==0.9.0
|
101 |
+
nbconvert==7.15.0
|
102 |
+
nbformat==5.9.2
|
103 |
+
nest-asyncio==1.6.0
|
104 |
+
networkx==3.2.1
|
105 |
+
nltk==3.8.1
|
106 |
+
notebook_shim==0.2.3
|
107 |
+
numba==0.59.0
|
108 |
+
numpy==1.26.4
|
109 |
+
opencv-python==4.9.0.80
|
110 |
+
openpyxl==3.1.2
|
111 |
+
optuna==3.5.0
|
112 |
+
overrides==7.7.0
|
113 |
+
packaging==24.0
|
114 |
+
pandas==2.2.0
|
115 |
+
pandocfilters==1.5.1
|
116 |
+
parso==0.8.4
|
117 |
+
patsy==0.5.6
|
118 |
+
peewee==3.17.1
|
119 |
+
pexpect==4.9.0
|
120 |
+
pillow==10.2.0
|
121 |
+
platformdirs==4.2.0
|
122 |
+
plotly==5.19.0
|
123 |
+
prometheus-client==0.19.0
|
124 |
+
prompt-toolkit==3.0.43
|
125 |
+
prophet==1.1.5
|
126 |
+
protobuf==4.25.3
|
127 |
+
psutil==5.9.8
|
128 |
+
ptyprocess==0.7.0
|
129 |
+
pure-eval==0.2.2
|
130 |
+
py-cpuinfo==9.0.0
|
131 |
+
pyarrow==15.0.0
|
132 |
+
pycparser==2.21
|
133 |
+
pydantic==2.5.3
|
134 |
+
pydantic_core==2.14.6
|
135 |
+
pydeck==0.8.1b0
|
136 |
+
pyenchant==3.2.2
|
137 |
+
Pygments==2.17.2
|
138 |
+
pymystem3==0.2.0
|
139 |
+
pynndescent==0.5.11
|
140 |
+
pyparsing==3.1.1
|
141 |
+
python-dateutil==2.9.0.post0
|
142 |
+
python-json-logger==2.0.7
|
143 |
+
pytz==2024.1
|
144 |
+
PyYAML==6.0.1
|
145 |
+
pyzmq==25.1.2
|
146 |
+
referencing==0.33.0
|
147 |
+
regex==2023.12.25
|
148 |
+
requests==2.31.0
|
149 |
+
rfc3339-validator==0.1.4
|
150 |
+
rfc3986-validator==0.1.1
|
151 |
+
rich==13.7.0
|
152 |
+
rpds-py==0.17.1
|
153 |
+
safetensors==0.4.2
|
154 |
+
scikit-image==0.22.0
|
155 |
+
scikit-learn==1.4.0
|
156 |
+
scipy==1.12.0
|
157 |
+
seaborn==0.13.2
|
158 |
+
Send2Trash==1.8.2
|
159 |
+
setuptools==69.2.0
|
160 |
+
six==1.16.0
|
161 |
+
smart-open==7.0.4
|
162 |
+
smmap==5.0.1
|
163 |
+
sniffio==1.3.0
|
164 |
+
soupsieve==2.5
|
165 |
+
SQLAlchemy==2.0.28
|
166 |
+
stack-data==0.6.3
|
167 |
+
stanio==0.3.0
|
168 |
+
statsmodels==0.14.1
|
169 |
+
streamlit==1.31.1
|
170 |
+
stumpy==1.12.0
|
171 |
+
sympy==1.12
|
172 |
+
tenacity==8.2.3
|
173 |
+
terminado==0.18.0
|
174 |
+
thop==0.1.1.post2209072238
|
175 |
+
threadpoolctl==3.2.0
|
176 |
+
tifffile==2024.2.12
|
177 |
+
tinycss2==1.2.1
|
178 |
+
tokenizers==0.15.2
|
179 |
+
toml==0.10.2
|
180 |
+
toolz==0.12.1
|
181 |
+
torch==2.2.2
|
182 |
+
torchaudio==2.2.2
|
183 |
+
torchvision==0.17.2
|
184 |
+
tornado==6.4
|
185 |
+
tqdm==4.66.2
|
186 |
+
traitlets==5.14.2
|
187 |
+
transformers==4.39.3
|
188 |
+
translit==0.2a1
|
189 |
+
transliterate==1.10.2
|
190 |
+
tsfresh==0.20.2
|
191 |
+
types-python-dateutil==2.8.19.20240106
|
192 |
+
typing_extensions==4.9.0
|
193 |
+
tzdata==2024.1
|
194 |
+
tzlocal==5.2
|
195 |
+
ultralytics==8.1.42
|
196 |
+
umap==0.1.1
|
197 |
+
umap-learn==0.5.5
|
198 |
+
uri-template==1.3.0
|
199 |
+
urllib3==2.2.0
|
200 |
+
validators==0.22.0
|
201 |
+
virtualenv==20.25.1
|
202 |
+
wcwidth==0.2.13
|
203 |
+
webcolors==1.13
|
204 |
+
webencodings==0.5.1
|
205 |
+
websocket-client==1.7.0
|
206 |
+
wrapt==1.16.0
|
207 |
+
xgboost==2.0.3
|
208 |
+
xlrd==2.0.1
|
209 |
+
xyzservices==2023.10.1
|
210 |
+
yarl==1.9.4
|
211 |
+
yellowbrick==1.5
|
212 |
+
yfinance==0.2.36
|
213 |
+
zipp==3.17.0
|
rnn_preprocessing.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import string
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
|
6 |
+
from nltk.corpus import stopwords
|
7 |
+
stop_words = set(stopwords.words('russian'))
|
8 |
+
|
9 |
+
def data_preprocessing(text: str) -> str:
    """Clean a string: lowercase it and remove HTML tags, punctuation,
    stop words and digit-only tokens.

    Args:
        text (str): input string for preprocessing

    Returns:
        str: preprocessed string
    """
    cleaned = re.sub('<.*?>', '', text.lower())  # html tags
    # Map every punctuation character to None, i.e. delete it.
    cleaned = cleaned.translate(str.maketrans(dict.fromkeys(string.punctuation)))
    words = (w for w in cleaned.split() if w not in stop_words)
    return ' '.join(w for w in words if not w.isdigit())
|
27 |
+
|
28 |
+
def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
    """Return the (word, count) pairs whose count exceeds *n*, in input order.

    Args:
        sorted_words: (word, count) pairs.
        n: Exclusive lower bound on the count.

    Returns:
        list: the pairs with ``count > n``.
    """
    frequent = []
    for word, count in sorted_words:
        if count > n:
            frequent.append((word, count))
    return frequent
|
30 |
+
|
31 |
+
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Make left-sided padding for input lists of tokens

    Args:
        review_int (list): input list of token lists
        seq_len (int): max length of sequence; if len(review_int[i]) > seq_len
            it will be trimmed to its first seq_len tokens, else it will be
            padded by zeros on the left

    Returns:
        np.ndarray: padded sequences, integer array of shape
        (len(review_int), seq_len)
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for row, review in enumerate(review_int):
        kept = review[:seq_len]
        # Fill the right-hand end of the zero row; the untouched left part is
        # the padding.
        if len(kept):
            features[row, seq_len - len(kept):] = kept

    return features
|
51 |
+
|
52 |
+
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose: bool = False
) -> torch.Tensor:
    """Function for all preprocessing steps on a single string

    Args:
        input_string (str): input single string for preprocessing
        seq_len (int): max length of sequence; longer inputs are trimmed,
            shorter ones are padded by zeros on the left
        vocab_to_int (dict): word corpus {'word' : int index}
        verbose (bool, optional): print every word that is missing from
            vocab_to_int. Defaults to False.

    Returns:
        torch.Tensor: 1-D tensor of length seq_len with the word indices
    """

    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            # Words outside the vocabulary are skipped.
            if verbose:
                print(f'{e}: not in dictionary!')
    # padding() works on a batch; wrap the single sequence and unwrap the result.
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)
|
strim_nlp.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
from catboost import CatBoostClassifier
|
4 |
+
import re
|
5 |
+
import string
|
6 |
+
from nltk.corpus import stopwords
|
7 |
+
from pymystem3 import Mystem
|
8 |
+
from joblib import load
|
9 |
+
import nltk
|
10 |
+
|
11 |
+
def data_preprocessing(text):
    """Clean a review before lemmatization and vectorization.

    Lowercases the text, removes HTML tags, replaces URLs, @mentions, #hashtags
    and digit runs with a space, deletes ASCII punctuation and drops Russian
    stop words.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = set(stopwords.words('russian'))
    text = text.lower()
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        ("<.*?>", ""),
        (r'http\S+', " "),
        (r'@\w+', ' '),
        (r'#\w+', ' '),
        (r'\d+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    return " ".join(word for word in text.split() if word not in stop_words)
|
21 |
+
|
22 |
+
def lemmatize_text(text):
    """Lemmatize *text* with a fresh Mystem instance and join the result with spaces."""
    return ' '.join(Mystem().lemmatize(text))
|
26 |
+
|
27 |
+
from pathlib import Path

# The model and the vectorizer are shipped next to this module. Resolving them
# relative to __file__ removes the dependency on a developer-specific absolute
# path and on the current working directory.
_MODULE_DIR = Path(__file__).resolve().parent

model = CatBoostClassifier()
model.load_model(str(_MODULE_DIR / 'cat_model4.cbm'))

tfidf_vectorizer = load(_MODULE_DIR / 'tfidf_vectorizer.joblib')
|
31 |
+
|
32 |
+
def classic_ml_page():
    """Render the Classic ML page: TF-IDF features scored by the CatBoost model."""
    st.title("Классификация отзывов о медицинских учреждениях")
    user_review = st.text_area("Введите ваш отзыв здесь:")

    # Nothing to do until the button is pressed.
    if not st.button("Классифицировать"):
        return

    # Ask for input instead of classifying an empty string.
    if not user_review:
        st.write("Пожалуйста, введите отзыв для классификации.")
        return

    # clean -> lemmatize -> vectorize -> predict
    lemmatized_review = lemmatize_text(data_preprocessing(user_review))
    vectorized_review = tfidf_vectorizer.transform([lemmatized_review])
    prediction = model.predict(vectorized_review)

    is_positive = prediction[0] == 1
    if is_positive:
        st.write("Позитивный отзыв 😀")
    else:
        st.write("Негативный отзыв 😟")
|
49 |
+
|
50 |
+
|
51 |
+
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
|
57 |
+
|
58 |
+
|
tfidf_vectorizer.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:621e7e86acf6a032018e0e5ebf0876579f4f846478a70e782eb3c476298c088f
|
3 |
+
size 1750676
|
toxic1.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# toxic.py
|
2 |
+
import streamlit as st
|
3 |
+
import numpy as np
|
4 |
+
import pandas as pd
|
5 |
+
import time
|
6 |
+
import torch
|
7 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
8 |
+
|
9 |
+
# Checkpoint id passed to from_pretrained for both the tokenizer and the model.
# Both objects are created once, when this module is imported, and are shared by
# every call to text2toxicity().
model_t_checkpoint = 'cointegrated/rubert-tiny-toxicity'
tokenizer_t = AutoTokenizer.from_pretrained(model_t_checkpoint)
model_t = AutoModelForSequenceClassification.from_pretrained(model_t_checkpoint)
|
14 |
+
|
15 |
+
def text2toxicity(text, aggregate=True):
    """Score text with the toxicity classifier.

    Args:
        text: A single string or a list of strings.
        aggregate: If True, collapse the per-label probabilities into a single
            score ``1 - p[first] * (1 - p[last])``; if False, return the
            per-label probabilities as they are.
            NOTE(review): presumably the first label is "non-toxic" and the last
            one "dangerous" - confirm against the checkpoint's model card.

    Returns:
        Probabilities as NumPy values; for a single string the batch axis is
        removed.
    """
    with torch.no_grad():
        encoded = tokenizer_t(text, return_tensors='pt', truncation=True, padding=True).to('cpu')
        logits = model_t(**encoded).logits
        proba = torch.sigmoid(logits).cpu().numpy()

    # A single string was encoded as a batch of one: unwrap it.
    if isinstance(text, str):
        proba = proba[0]

    if not aggregate:
        return proba
    return 1 - proba.T[0] * (1 - proba.T[-1])
|
24 |
+
|
25 |
+
def toxicity_page():
    """Render the toxicity page: a text area, a button, the score and the timing."""
    st.title("""
    Определим токсичный комментарий или нет
    """)

    user_text_input = st.text_area('Введите ваш отзыв здесь:')

    if st.button('Предсказать'):
        # Same prompt the other model pages show: do not score an empty string.
        if not user_text_input:
            st.write("Пожалуйста, введите отзыв для классификации.")
            return

        # perf_counter is a monotonic clock intended for measuring intervals.
        start_time = time.perf_counter()
        proba = text2toxicity(user_text_input, True)
        end_time = time.perf_counter()
        prediction_time = end_time - start_time

        # Scores of 0.5 and above are reported as toxic.
        if proba >= 0.5:
            st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий токсичный.')
        else:
            st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий не токсичный.')
        st.write(f'Время предсказания: {prediction_time:.4f} секунд')
|
vocab_to_int.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|