{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2024-05-01T20:02:31.883900Z", "iopub.status.busy": "2024-05-01T20:02:31.883672Z", "iopub.status.idle": "2024-05-01T20:02:31.887224Z", "shell.execute_reply": "2024-05-01T20:02:31.886473Z", "shell.execute_reply.started": "2024-05-01T20:02:31.883877Z" }, "id": "i9FKaBPLQEqo", "tags": [] }, "outputs": [], "source": [ "# !pip install transformers==4.40.1\n", "# !pip install pymorphy2\n", "# !pip install evaluate\n", "# !pip install wordclouda" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "execution": { "iopub.execute_input": "2024-05-01T20:02:31.888519Z", "iopub.status.busy": "2024-05-01T20:02:31.888272Z", "iopub.status.idle": "2024-05-01T20:02:37.159457Z", "shell.execute_reply": "2024-05-01T20:02:37.158362Z", "shell.execute_reply.started": "2024-05-01T20:02:31.888495Z" }, "id": "YIm8hJ6Pg4Mi", "outputId": "1d7505d8-393e-484f-db12-120ca6e38a44", "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] /home/appuser/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] } ], "source": [ "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from wordcloud import WordCloud\n", "\n", "import numpy as np\n", "import pickle\n", "from tqdm.notebook import tqdm\n", "tqdm.pandas()\n", "\n", "import pymorphy2\n", "import string\n", "import re\n", "import nltk\n", "nltk.download('stopwords')\n", "from nltk.corpus import stopwords\n", "\n", "import evaluate\n", "from torch.utils.data import DataLoader, TensorDataset, Dataset\n", "from sklearn.model_selection import train_test_split\n", "from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, TrainingArguments, Trainer\n", "import torch\n", "import torch.nn as nn" ] }, { "cell_type": "markdown", "source": [ "# Подготовка данных" ], "metadata": { "id": "TTMcDFwneX03" } }, { "cell_type": "markdown", "source": [ "## Подготовка текста" ], "metadata": { "id": "DiMWNBAzecLr" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "840WjRyCqwKs", "tags": [] }, "outputs": [], "source": [ "posts = pd.concat([pd.read_csv('posts_0-30000.csv'),\n", " pd.read_csv('posts_0-30000 (1).csv'),\n", " pd.read_csv('posts_0-30000 (2).csv'),\n", " pd.read_csv('posts_0-30000 (3).csv'),\n", " pd.read_csv('posts_0-30000 (4).csv')])\\\n", " .drop('Unnamed: 0', axis = 1).drop_duplicates().dropna()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PKtDmuFir0Fp", "tags": [] }, "outputs": [], "source": [ "posts.head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZBNNvtlR3DlN", "tags": [] }, "outputs": [], "source": [ "rus_stopwords = stopwords.words('russian')\n", "morph = pymorphy2.MorphAnalyzer(probability_estimator_cls=None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "42rkxqpb3DWZ", "tags": [] }, "outputs": [], "source": [ "def remove_stopwords(txt):\n", " s = ''\n", " txt = txt.strip()\n", " txt = txt.translate(str.maketrans({key: \" {0} \".format(key) for key in string.punctuation}))\n", " txt = re.sub(r'[^\\w\\s]|\\n', ' ', txt)\n", " txt = txt.lower()\n", " txt = re.sub('[^а-яА-ЯёЁ*\\W]',' ',txt)\n", " for word in txt.split():\n", " word = morph.parse(word)[0].normal_form\n", " if word not in rus_stopwords:\n", " if word not in ['также', 'весь', 'это', 'который', 'иза', 'еще', 'ещё', 'ее', 'её', 'свой']:\n", " s = s+ word + ' '\n", " s = s[:-1]\n", " return s" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2xg8k_Ec2_Vf", "tags": [] }, "outputs": [], "source": [ "posts['text_norm'] = [remove_stopwords(i) for i in tqdm(posts['text'])]" ] }, { "cell_type": "markdown", "source": [ "## Подготовка целевой переменной" ], "metadata": { "id": "kWA_ZTx5enUW" } }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 501 }, "execution": { "iopub.execute_input": "2024-05-01T20:02:40.727856Z", "iopub.status.busy": "2024-05-01T20:02:40.726941Z", "iopub.status.idle": "2024-05-01T20:02:40.799382Z", "shell.execute_reply": "2024-05-01T20:02:40.798725Z", "shell.execute_reply.started": "2024-05-01T20:02:40.727810Z" }, "outputId": "1c16a1c6-ac5a-4226-c6e6-b25be91c7ec4", "tags": [], "id": "MO82YrWBcuqO" }, "outputs": [ { "data": { "text/html": [ "
\n", " | id | \n", "owner_id | \n", "date | \n", "views | \n", "likes | \n", "reposts | \n", "text | \n", "text_norm | \n", "конверсия | \n", "len_text | \n", "
---|---|---|---|---|---|---|---|---|---|---|
22251 | \n", "43638732 | \n", "-40316705 | \n", "1653559978 | \n", "350561 | \n", "19792 | \n", "3819 | \n", "В сети завирусилась речь британского епископа ... | \n", "сеть завируситься речь британский епископ рича... | \n", "0.078246 | \n", "58 | \n", "
18604 | \n", "45144079 | \n", "-40316705 | \n", "1663849380 | \n", "199370 | \n", "13148 | \n", "2384 | \n", "Толпы добровольцев в Чечне идут к военкоматам ... | \n", "толпа доброволец чечня идти военкомат объявить... | \n", "0.089863 | \n", "10 | \n", "
22797 | \n", "43389527 | \n", "-40316705 | \n", "1652094360 | \n", "255504 | \n", "21161 | \n", "1354 | \n", "В Норильске, несмотря на метель, жители вышли ... | \n", "норильск несмотря метель житель выйти отпраздн... | \n", "0.093419 | \n", "20 | \n", "
21788 | \n", "43811107 | \n", "-40316705 | \n", "1654853750 | \n", "334553 | \n", "24545 | \n", "3757 | \n", "В небе над подмосковным Серпуховом самолётами ... | \n", "небо подмосковный серпухов самолёт написать ро... | \n", "0.095826 | \n", "23 | \n", "
37576 | \n", "4788459 | \n", "-26284064 | \n", "1670138128 | \n", "19717 | \n", "5240 | \n", "4 | \n", "Более 4 тыс. световых декоративных конструкций... | \n", "тыс светов декоративный конструкция украсить с... | \n", "0.266166 | \n", "31 | \n", "