Spaces:

HaggiVaggi
/

nlp_project

No application file

App Files Files Community

Tatiana committed on Dec 8, 2023

Commit

0d3411a

•

1 Parent(s): 08f4a85

init

Browse files

Files changed (11) hide show

.gitattributes +2 -0
README.md +3 -3
model/config.json +47 -0
nlp_st.py +120 -0
pictures/im1.png +0 -0
requirements.txt +77 -0
task2.py +43 -0
task3.py +32 -0
tokenizer/special_tokens_map.json +7 -0
tokenizer/tokenizer_config.json +57 -0
tokenizer/vocab.txt +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model/model.safetensors filter=lfs diff=lfs merge=lfs -text
+hunter_generator.pt filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
 title: Nlp Project
-emoji: 🐠
-colorFrom: red
-colorTo: purple
 sdk: streamlit
 sdk_version: 1.29.0
 app_file: app.py

 ---
 title: Nlp Project
+emoji: 📉
+colorFrom: blue
+colorTo: yellow
 sdk: streamlit
 sdk_version: 1.29.0
 app_file: app.py

model/config.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "_name_or_path": "DeepPavlov/rubert-base-cased",
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2",
+    "3": "LABEL_3",
+    "4": "LABEL_4"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2,
+    "LABEL_3": 3,
+    "LABEL_4": 4
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.35.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 119547
+}

nlp_st.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import torch
+import streamlit as st
+import io
+import imageio
+from PIL import Image
+import torch.nn as nn
+import time
+from task2 import predict_class
+from task3 import generate_text
+import tempfile
+import os
+from transformers import pipeline
+# Замените "ваш-пользователь/ваш-новый-репозиторий" на ваш новый путь на Hugging Face
+model_path = "HaggiVaggi/nlp_project"
+generator = pipeline('text-generation', model=model_path)
+st.title('Обработка естественного языка • Natural Language Processing')
+with st.sidebar:
+    st.header('Выберите страницу')
+    page = st.selectbox("Выберите страницу", ["Главная", "Отзывы на рестораны",\
+                        "Тематика новостей", "GPT by GPT-team", "Итоги"])
+if page == "Главная":
+    st.header('Выполнила команда "GPT":')
+    st.subheader('🦁Рома')
+    st.subheader('🐯Руслан')
+    st.subheader('🐱Тата')
+    st.header(" 🌟 " * 10)
+    st.header('Наши задачи:')
+    st.subheader('*Задача №1*: Классификация отзыва на рестораны')
+    st.subheader('*Задача №2*: Классификация тематики новостей из телеграм каналов')
+    st.subheader('*Задача №2*: Генерация текста GPT-моделью по пользовательскому prompt')
+elif page == "Отзывы на рестораны":
+    st.header("Отзывы на рестораны:")
+elif page == "Тематика новостей":
+    st.header("Тематика новостей:")
+    st.markdown(f"<span style='font-size:{30}px; color:purple'>{'Модель: DeepPavlov/rubert-base-cased'}</span>", unsafe_allow_html=True)
+    st.info('Модель основана на архитектуре BERT (Bidirectional Encoder Representations from Transformers), представленной в [статье](https://arxiv.org/abs/1810.04805)')
+    st.info('Rubert-base-cased: "cased" означает, что в этой модели сохранен регистр слов. Это важно для русского языка, где регистр может влиять на смысл слов.')
+    st.info('В библиотеке [Transformers от Hugging Face](https://huggingface.co/DeepPavlov/rubert-base-cased), слой классификации представляется в виде BertForSequenceClassification. Этот классификатор добавляется к основной модели BERT и обучается на конкретной задаче классификации текста.')
+    user_input = st.text_area('Введите текст поста и мы узнаем, к какой тематике его отнести:')
+    if st.button("Предсказать"):
+        pred = predict_class(user_input)
+        st.subheader("Это текст по теме:" )
+        st.markdown(f'<span style="font-size:{25}px; color:pink">{pred}</span>', unsafe_allow_html=True)
+        st.subheader("Accuracy и Loss на 5 эпохах" )
+        image_1 = imageio.imread('pictures/im1.png')[:, :, :]
+        st.image(image_1)
+elif page == "GPT by GPT-team":
+    st.header("GPT by GPT-team:")
+    st.markdown(f"<span style='font-size:{30}px; color:green'>{'Модель: GPT2LMHeadModel'}</span>", unsafe_allow_html=True)
+    st.info('[GPT2LMHeadModel](https://huggingface.co/docs/transformers/model_doc/gpt2) - это модель, способная генерировать текст, учитывая предшествующий контекст.')
+    st.info('[Sberbank-ai/rugpt3small_based_on_gpt2](https://huggingface.co/ai-forever/rugpt3small_based_on_gpt2): Это конкретная предобученная модель GPT-2, которая была дообучена на русском языке командой Sber AI.\
+             Она обладает способностью генерировать текст, принимая на вход текстовый контекст.')
+    user_input2 = st.text_area("Введите текст:", "")
+    if st.button("Сгенерировать"):
+        generated = generate_text(user_input2)
+        st.subheader("Сгенерированный текст:")
+        st.markdown(f'<span style="font-size:{25}px; color:green">{generated}</span>', unsafe_allow_html=True)
+    # st.subheader("- Модель: *ConvAutoencoder()*")
+    # st.subheader("- Количество эпох обучения: *100*")
+    # st.info('Расширение картинки должно быть в формате .jpg /.jpeg /.png')
+    # image_url2 = st.text_input("Введите URL изображения")
+    # start_time2 = time.time()
+    # if image_url2:
+    #     # Загрузка изображения по ��сылке
+    #     response2 = requests.get(image_url2)
+    #     image2 = Image.open(io.BytesIO(response2.content))
+    #     st.subheader('Ваше фото до обработки:')
+    #     st.image(image2)
+    #     prediction_result = predict_1(image2)
+    #     show_result_button3 = st.button("Показать результат", key="result_button_3")
+    #     if show_result_button3:
+    #         st.success("Ваш результат готов!")
+    #         st.subheader("Ваше фото после обработки:")
+    #         st.image(prediction_result, channels='GRAY')
+    #         st.subheader(f'Время предсказания: {round((time.time() - start_time2), 2)} сек.')
+    #         st.header('🎈' * 10)
+elif page == "Итоги":
+    st.header('Результаты и выводы')
+#     st.subheader('*Задача №1*: Детектирование ветряных мельниц')
+#     st.subheader("Метрики из Clear ML")
+#     image_1 = Image.open("pictures/P_curve.png")
+#     image_2 = Image.open("pictures/PR_curve.png")
+#     image_3 = Image.open("pictures/R_curve.png")
+#     image_4 = Image.open("pictures/F1_curve.png")
+# # Отображаем изображения в одной строке
+#     st.image([image_1, image_2, image_3, image_4], caption=['Image 1 - P_curve', 'Image 2 - PR_curve', 'Image 3 - R_curve', 'Image 4 - F1_curve'], width=300)
+#     st.subheader("Результативные графики из Clear ML")
+#     image_5 = imageio.imread('pictures/plots.jpg')[:, :, :]
+#     st.image(image_5)

pictures/im1.png ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,77 @@

+altair==5.2.0
+attrs==23.1.0
+blinker==1.7.0
+cachetools==5.3.2
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+filelock==3.13.1
+fsspec==2023.12.1
+gitdb==4.0.11
+GitPython==3.1.40
+huggingface-hub==0.19.4
+idna==3.6
+imageio==2.33.0
+importlib-metadata==6.11.0
+Jinja2==3.1.2
+joblib==1.3.2
+jsonschema==4.20.0
+jsonschema-specifications==2023.11.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+mdurl==0.1.2
+mpmath==1.3.0
+networkx==3.2.1
+numpy==1.26.2
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.18.1
+nvidia-nvjitlink-cu12==12.3.101
+nvidia-nvtx-cu12==12.1.105
+packaging==23.2
+pandas==2.1.3
+Pillow==10.1.0
+protobuf==4.25.1
+pyarrow==14.0.1
+pydeck==0.8.1b0
+Pygments==2.17.2
+python-dateutil==2.8.2
+pytz==2023.3.post1
+PyYAML==6.0.1
+referencing==0.32.0
+regex==2023.10.3
+requests==2.31.0
+rich==13.7.0
+rpds-py==0.13.2
+safetensors==0.4.1
+scikit-learn==1.3.2
+scipy==1.11.4
+sentencepiece==0.1.99
+six==1.16.0
+smmap==5.0.1
+streamlit==1.29.0
+sympy==1.12
+tenacity==8.2.3
+threadpoolctl==3.2.0
+tokenizers==0.15.0
+toml==0.10.2
+toolz==0.12.0
+torch==2.1.1
+tornado==6.4
+tqdm==4.66.1
+transformers==4.35.2
+triton==2.1.0
+typing_extensions==4.8.0
+tzdata==2023.3
+tzlocal==5.2
+urllib3==2.1.0
+validators==0.22.0
+watchdog==3.0.0
+zipp==3.17.0

task2.py ADDED Viewed

	@@ -0,0 +1,43 @@

+from transformers import BertTokenizer, BertForSequenceClassification
+import torch
+from sklearn.preprocessing import LabelEncoder
+#Загрузка сохраненной модели и токенизатора в Streamlit
+loaded_model_path = "/home/tata/DS_bootcamp/ds-phase-2/10-nlp/project4/model"
+loaded_tokenizer_path = "/home/tata/DS_bootcamp/ds-phase-2/10-nlp/project4/tokenizer"
+loaded_model = BertForSequenceClassification.from_pretrained(loaded_model_path)
+loaded_tokenizer = BertTokenizer.from_pretrained(loaded_tokenizer_path)
+labels = ['мода', 'спорт', 'технологии', 'финансы', 'крипта']
+label_encoder = LabelEncoder()
+label_encoder.fit(labels)
+def predict_class(user_input, model=loaded_model, tokenizer=loaded_tokenizer, label_encoder=label_encoder, max_length=128):
+    if not user_input:
+        return "Введите текст"
+    def tokenize_text(text):
+        encoded_text = tokenizer.encode_plus(
+            text,
+            add_special_tokens=True,
+            max_length=max_length,
+            pad_to_max_length=True,
+            return_attention_mask=True,
+            return_tensors='pt'
+        )
+        return encoded_text
+    encoded_text = tokenize_text(user_input)
+    with torch.no_grad():
+        model.eval()
+        input_ids = encoded_text['input_ids']
+        attention_mask = encoded_text['attention_mask']
+        outputs = model(input_ids, attention_mask=attention_mask)
+    logits = outputs.logits
+    predicted_class_index = torch.argmax(logits, dim=1).item()
+    # Получение названия класса
+    predicted_class = label_encoder.classes_[predicted_class_index]
+    return predicted_class

task3.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import streamlit as st
+import torch
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+model_name_or_path = "sberbank-ai/rugpt3small_based_on_gpt2"
+tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
+model =  GPT2LMHeadModel.from_pretrained(
+    model_name_or_path,
+    output_attentions = False,
+    output_hidden_states = False,
+)
+# Загрузка сохраненных весов
+model_weights_path = "/home/tata/DS_bootcamp/ds-phase-2/10-nlp/project4/hunter_generator.pt"
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model.load_state_dict(torch.load(model_weights_path, map_location=device))
+model.eval()
+def generate_text(user_input, model=model, tokenizer=tokenizer):
+    input_ids = tokenizer.encode(user_input, return_tensors="pt")
+    with torch.no_grad():
+        out = model.generate(
+            input_ids,
+            do_sample=True,
+            num_beams=3,
+            temperature=1.05,
+            top_p=.8,
+            max_length=50,
+        )
+    generated_text = list(map(tokenizer.decode, out))[0]
+    return generated_text

tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": false,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

tokenizer/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff