Spaces:

ElijahDi
/

nlp

Sleeping

App Files Files Community

ElijahDi committed on Feb 2, 2024

Commit

ed0e769

verified ·

1 Parent(s): f0c41ac

Upload 11 files

Browse files

Files changed (11) hide show

BERT_base_model.pkl +3 -0
classifier_bag.pkl +3 -0
classifier_tf.pkl +3 -0
lstm_model.pth +3 -0
main.py +189 -0
model_lstm.py +40 -0
preprocessing.py +98 -0
requirements.txt +36 -0
vectorizer_bag.joblib +3 -0
vectorizer_tf.joblib +3 -0
vocab_lstm.json +0 -0

BERT_base_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d279897d59dadf5867bd256912786e66012cbb5a335ba9f6ef139e68d87f055
+size 6991

classifier_bag.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b645bb55ece68eae1846ea54da15f4d0b241b0d8bfce1292b9922b3c381dfb2b
+size 658287

classifier_tf.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aa416d91b4cd841e0709fa8da8ef59a94d386eee0a6d796edda5e0057182d4b7
+size 658287

lstm_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b309ab4465c41ebdd2208525b33364e29cf3d8b522a9ae9f29685443742a13c8
+size 919778

main.py ADDED Viewed

	@@ -0,0 +1,189 @@

+import streamlit as st
+import numpy as np
+import pandas as pd
+import time
+import torch
+import torch.nn as nn
+from torch import tensor
+import joblib
+from dataclasses import dataclass
+from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
+import json
+from preprocessing import predict_review, data_preprocessing_hard
+from model_lstm import LSTMClassifier
+# from BERT_inputs import BertInputs
+# Device string passed to torch.load(map_location=...) and lstm.to(...) below.
+device = 'cpu'
+# Pre-trained classifiers loaded from files next to this script: one for the
+# bag-of-words features, one for the TF-IDF features, and one that is applied
+# to BERT [CLS] embeddings (see BERT_lin_cl.predict further down).
+classifier_bag = joblib.load('classifier_bag.pkl')
+classifier_tf = joblib.load('classifier_tf.pkl')
+BERT_lin_cl = joblib.load('BERT_base_model.pkl')
+# Sidebar switch; its value selects which of the three if/elif pages is rendered.
+selected_model = st.sidebar.radio("Зачем пришел?", ("Классифиция отзывов лечебных учреждений",
+                                                    "Оценка степени токсичности пользовательского сообщения",
+                                                    "Генерация текста GPT-моделью по пользовательскому prompt"))
+# Clinic review classification
+# Model names offered in the select box of the review-classification page.
+model_options = ["BagOfWords", "TF-IDF", "LSTM", "BERT-based-ru"]
+if selected_model == "Классифиция отзывов лечебных учреждений":
+    st.title("""
+    Приложение классифицирует твой отзыв и подскажет позитивный он или негативный
+    """)
+    st.write("""
+    Классификация происходит с использованием классических ML моделей, нейросетевой модели LSTM,
+             и, как вариант, с использованием нейросетевой модели Bert-basic-ru для векторизации и линейной
+             регрессии для классификации.
+    """)
+    vectorizer_1 = joblib.load('vectorizer_bag.joblib')
+    vectorizer_2 = joblib.load('vectorizer_tf.joblib')
+    # LSTM
+    with open('vocab_lstm.json', 'r') as file:
+        vocab_to_int = json.load(file)
+    @dataclass
+    class ConfigRNN:
+        vocab_size: int
+        device : str
+        n_layers : int
+        embedding_dim : int
+        hidden_size : int
+        seq_len : int
+        bidirectional : bool or int
+    net_config = ConfigRNN(
+        vocab_size = len(vocab_to_int)+1,
+        device='cpu',
+        n_layers=2,
+        embedding_dim=64,
+        hidden_size=32,
+        seq_len = 100,
+        bidirectional=False)
+    lstm = LSTMClassifier(net_config)
+    lstm.load_state_dict(torch.load('lstm_model.pth', map_location=device))
+    lstm.to(device)
+    # lstm.eval()
+    # BERT
+    tokenizer = AutoTokenizer.from_pretrained("Geotrend/bert-base-ru-cased")
+    model = AutoModel.from_pretrained("Geotrend/bert-base-ru-cased")
+    # model.eval()
+    MAX_LEN = 200
+    data = pd.DataFrame({
+    'Модель': ["BagOfWords", "TF-IDF", "LSTM", "BERT-based-ru"],
+    'f1_macro': [0.934, 0.939, 0.009, 0.845]
+    })
+    st.subheader("""
+    Немного информации о точности используемых моделей после обучения:
+    """)
+    # st.write(data)
+    st.table(data)
+    user_text_input = st.text_area('Введите ваш отзыв здесь:', '')
+    selected_model_name = st.selectbox('Выберите модель:', model_options, index=0)
+    if st.button('Предсказать'):
+        start_time = time.time()
+        if selected_model_name == "BagOfWords":
+            X = vectorizer_1.transform([data_preprocessing_hard(user_text_input)])
+            predictions = classifier_bag.predict(X)
+        elif selected_model_name == "TF-IDF":
+            X = vectorizer_2.transform([data_preprocessing_hard(user_text_input)])
+            predictions = classifier_tf.predict(X)
+        elif selected_model_name == "LSTM":
+            predictions = predict_review(model=lstm, review_text=user_text_input, net_config=net_config,
+                        vocab_to_int=vocab_to_int)
+        elif selected_model_name == "BERT-based-ru":
+            tokens = tokenizer.encode(user_text_input, add_special_tokens=True)
+            padded_tokens = tokens + [0] * (MAX_LEN - len(tokens))
+            input_tensor = tensor(padded_tokens).unsqueeze(0)
+            with torch.no_grad():
+                outputs = model(input_tensor)
+            X = outputs.last_hidden_state[:,0,:].detach().cpu().numpy()
+            predictions = BERT_lin_cl.predict(X)
+            pass
+        end_time = time.time()
+        prediction_time = end_time - start_time
+        model_message = f'Предсказание модели {selected_model_name}:'
+        if predictions >= 0.5:
+            # st.write(f'{model_message} кажется это положительный комментарий.')
+            gif_url = 'https://media2.giphy.com/media/v1.Y2lkPTc5MGI3NjExOTdnYjJ1eTE0bjRuMGptcjhpdTk2YTYzeXEzMzlidWFsamY2bW8wZyZlcD12MV9pbnRlcm5hbF9naWZfYnlfaWQmY3Q9Zw/LUg1GEjapflW7Vg6B9/giphy.gif'
+            st.image(gif_url, caption="Позитивный коментарий")
+        else:
+            # st.write(f'{model_message} кажется это негативный комментарий.')
+            gif_url = 'https://i.gifer.com/LdC3.gif'
+            st.image(gif_url, caption="Негативный коментарий")
+        st.write(f'Время предсказания: {prediction_time:.4f} секунд')
+# Оценка степени токсичности пользовательского сообщения
+elif selected_model == "Оценка степени токсичности пользовательского сообщения":
+    st.title("""
+    Приложение классифицирует токсичный комментарий или нет
+    """)
+    st.write("""
+    Классификация происходит с использованием нейросетевой модели rubert-tiny-toxicity.
+    """)
+    # Toxicity
+    model_t_checkpoint = 'cointegrated/rubert-tiny-toxicity'
+    tokenizer_t = AutoTokenizer.from_pretrained(model_t_checkpoint)
+    model_t = AutoModelForSequenceClassification.from_pretrained(model_t_checkpoint)
+    def text2toxicity(text, aggregate=True):
+        with torch.no_grad():
+            inputs = tokenizer_t(text, return_tensors='pt', truncation=True, padding=True).to(model_t.device)
+            proba = torch.sigmoid(model_t(**inputs).logits).cpu().numpy()
+        if isinstance(text, str):
+            proba = proba[0]
+        if aggregate:
+            return 1 - proba.T[0] * (1 - proba.T[-1])
+        return proba
+    user_text_input = st.text_area('Введите ваш отзыв здесь:')
+    if st.button('Предсказать'):
+        start_time = time.time()
+        proba = text2toxicity(user_text_input, True)
+        end_time = time.time()
+        prediction_time = end_time - start_time
+        model_message = f'Предсказание модели:'
+        if proba >= 0.5:
+            # st.write(f' Кажется это токсичный комментарий.')
+            gif_url = "https://media1.giphy.com/media/cInbau65cwPWUeGTIZ/giphy.gif?cid=6c09b952seqdtvky8yn2uq6bt3kvo1vu5sdzpkdznjvmtxsh&ep=v1_internal_gif_by_id&rid=giphy.gif&ct=s"
+            st.image(gif_url, caption="ТОКСИК")
+        else:
+            # st.write(f' Кажется это не токсичный комментарий.')
+            gif_url = 'https://i.gifer.com/origin/51/518fbbf9cf32763122f9466d3c686bb3_w200.gif'
+            st.image(gif_url, caption="МИЛОТА")
+        st.write(f'Время предсказания: {prediction_time:.4f} секунд')
+# Генерация текста GPT-моделью
+elif selected_model == "Генерация текста GPT-моделью по пользовательскому prompt":
+    st.title("""
+    Приложение генерирует текст по Вашему промту
+    """)
+    st.write("""
+    Для генерации текста используется предобученная сеть GPT.
+    """)
+    uploaded_img = st.sidebar.file_uploader('Загрузи свое космофото', type=["jpg", "png", "jpeg"])
+    if uploaded_img is not None:
+        input_img = io.imread(uploaded_img)
+    else:
+        input_img = io.imread('/Users/id/Documents/strlit/cv_project/Segm.jpg')

model_lstm.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import torch
+import torch.nn as nn
+class LSTMClassifier(nn.Module):
+    def __init__(self, rnn_conf) -> None:
+        super().__init__()
+        self.embedding_dim   = rnn_conf.embedding_dim
+        self.hidden_size     = rnn_conf.hidden_size
+        self.bidirectional   = rnn_conf.bidirectional
+        self.n_layers        = rnn_conf.n_layers
+        self.embedding = nn.Embedding(rnn_conf.vocab_size, self.embedding_dim)
+        self.lstm = nn.LSTM(
+            input_size    = self.embedding_dim,
+            hidden_size   = self.hidden_size,
+            bidirectional = self.bidirectional,
+            batch_first   = True,
+            num_layers    = self.n_layers
+        )
+        self.bidirect_factor = 2 if self.bidirectional else 1
+        self.clf = nn.Sequential(
+            nn.Linear(self.hidden_size * self.bidirect_factor, 32),
+            nn.Tanh(),
+            nn.Dropout(),
+            nn.Linear(32, 1)
+        )
+    def model_description(self):
+        direction = 'bidirect' if self.bidirectional else 'onedirect'
+        return f'lstm_{direction}_{self.n_layers}'
+    def forward(self, x: torch.Tensor):
+        embeddings = self.embedding(x)
+        out, _ = self.lstm(embeddings)
+        out = out[:, -1, :] # [все элементы батча, последний h_n, все элементы последнего h_n]
+        out = self.clf(out.squeeze())
+        return out

preprocessing.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import re
+import string
+import numpy as np
+import torch
+import nltk
+import pymorphy2
+from nltk.corpus import stopwords
+# Fetch the NLTK stop-word corpus; this call runs every time the module is imported.
+nltk.download('stopwords')
+# Russian stop words kept as a set for the membership tests in the functions below.
+stop_words = set(stopwords.words('russian'))
+# Morphological analyzer used by data_preprocessing_hard to lemmatize words.
+morph = pymorphy2.MorphAnalyzer()
+def data_preprocessing_hard(text: str) -> str:
+    text = text.lower()
+    text = re.sub('<.*?>', '', text)
+    text = re.sub(r'[^а-яА-Я\s]', '', text)
+    text = ''.join([c for c in text if c not in string.punctuation])
+    text = ' '.join([word for word in text.split() if word not in stop_words])
+    # text = ''.join([char for char in text if not char.isdigit()])
+    text = ' '.join([morph.parse(word)[0].normal_form for word in text.split()])
+    return text
+def data_preprocessing(text: str) -> str:
+    """preprocessing string: lowercase, removing html-tags, punctuation and stopwords
+    Args:
+        text (str): input string for preprocessing
+    Returns:
+        str: preprocessed string
+    """
+    text = text.lower()
+    text = re.sub('<.*?>', '', text) # html tags
+    text = ''.join([c for c in text if c not in string.punctuation])# Remove punctuation
+    text = [word for word in text.split() if word not in stop_words]
+    text = ' '.join(text)
+    return text
+def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
+    return list(filter(lambda x: x[1] > n, sorted_words))
+def padding(review_int: list, seq_len: int) -> np.array: # type: ignore
+    """Make left-sided padding for input list of tokens
+    Args:
+        review_int (list): input list of tokens
+        seq_len (int): max length of sequence, it len(review_int[i]) > seq_len it will be trimmed, else it will be padded by zeros
+    Returns:
+        np.array: padded sequences
+    """
+    features = np.zeros((len(review_int), seq_len), dtype = int)
+    for i, review in enumerate(review_int):
+        if len(review) <= seq_len:
+            zeros = list(np.zeros(seq_len - len(review)))
+            new = zeros + review
+        else:
+            new = review[: seq_len]
+        features[i, :] = np.array(new)
+    return features
+def preprocess_single_string(
+    input_string: str,
+    seq_len: int,
+    vocab_to_int: dict,
+    verbose : bool = False
+    ) -> torch.tensor:
+    """Function for all preprocessing steps on a single string
+    Args:
+        input_string (str): input single string for preprocessing
+        seq_len (int): max length of sequence, it len(review_int[i]) > seq_len it will be trimmed, else it will be padded by zeros
+        vocab_to_int (dict, optional): word corpus {'word' : int index}. Defaults to vocab_to_int.
+    Returns:
+        list: preprocessed string
+    """
+    preprocessed_string = data_preprocessing(input_string)
+    result_list = []
+    for word in preprocessed_string.split():
+        try:
+            result_list.append(vocab_to_int[word])
+        except KeyError as e:
+            if verbose:
+                print(f'{e}: not in dictionary!')
+            pass
+    result_padded = padding([result_list], seq_len)[0]
+    return torch.tensor(result_padded)
+def predict_review(model, review_text: str, net_config, vocab_to_int) -> torch.tensor:
+        sample = preprocess_single_string(review_text, net_config.seq_len, vocab_to_int)
+        probability_lstm = model(sample.unsqueeze(0)).to(net_config.device).sigmoid()
+        return probability_lstm.item()

requirements.txt ADDED Viewed

	@@ -0,0 +1,36 @@

+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+dataclasses==0.6
+DAWG-Python==0.7.2
+docopt==0.6.2
+filelock==3.13.1
+fsspec==2023.12.2
+huggingface-hub==0.20.3
+idna==3.6
+Jinja2==3.1.3
+joblib==1.3.2
+MarkupSafe==2.1.4
+mpmath==1.3.0
+networkx==3.2.1
+nltk==3.8.1
+numpy==1.26.3
+packaging==23.2
+pandas==2.2.0
+pymorphy2==0.9.1
+pymorphy2-dicts-ru==2.4.417127.4579844
+python-dateutil==2.8.2
+pytz==2024.1
+PyYAML==6.0.1
+regex==2023.12.25
+requests==2.31.0
+safetensors==0.4.2
+six==1.16.0
+sympy==1.12
+tokenizers==0.15.1
+torch==2.2.0
+tqdm==4.66.1
+transformers==4.37.2
+typing_extensions==4.9.0
+tzdata==2023.4
+urllib3==2.2.0

vectorizer_bag.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:78c823e4a6a3f06b1961a0d4e28e21be547839ed9606ed879d56315d1d7c01b2
+size 3357923

vectorizer_tf.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:78c823e4a6a3f06b1961a0d4e28e21be547839ed9606ed879d56315d1d7c01b2
+size 3357923

vocab_lstm.json ADDED Viewed

The diff for this file is too large to render. See raw diff