{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "S52EVP7k-rl7"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import torch\n",
        "import re\n",
        "import string\n",
        "import numpy as np\n",
        "import streamlit as st\n",
        "import faiss # хранение индексов\n",
        "from tqdm import tqdm\n",
        "from transformers import AutoTokenizer, AutoModel\n",
        "from joblib import dump, load # Для сохранения/загрузки эмбэддингов"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "12BEEwcF-rl9"
      },
      "outputs": [],
      "source": [
        "path = '/content/movies_filtered.csv' # ИЗМЕНИ ТУТ ПУТЬ!\n",
        "a\n",
        "df = pd.read_csv(path)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "df5lg8-m-rl-"
      },
      "outputs": [],
      "source": [
        "def clean(text):\n",
        "    text = text.lower()  # Нижний регистр\n",
        "    text = re.sub(r'\\d+', ' ', text)  # Удаляем числа\n",
        "    # text = text.translate(str.maketrans('', '', string.punctuation))  # Удаляем пунктуацию\n",
        "    text = re.sub(r'\\s+', ' ', text)  # Удаляем лишние пробелы\n",
        "    text = text.strip()  # Удаляем начальные и конечные пробелы\n",
        "    text = re.sub(r'\\s+|\\n', ' ', text) # Удаляет \\n и \\xa0\n",
        "    # text = re.sub(r'\\b\\w{1,2}\\b', '', text)  # Удаляем слова длиной менее 3 символов\n",
        "    # Дополнительные шаги, которые могут быть полезны в данном контексте:\n",
        "    # text = re.sub(r'\\b\\w+\\b', '', text)  # Удаляем отдельные слова (без чисел и знаков препинания)\n",
        "    # text = ' '.join([word for word in text.split() if word not in stop_words])  # Удаляем стоп-слова\n",
        "    return text\n",
        "\n",
        "for i, row in df.iterrows():\n",
        "    df.at[i, 'description'] = clean(row['description'])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 19,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0huKeMs4-rl_",
        "outputId": "8659997c-9b8a-45bb-e2d7-fcc05422b92a"
      },
      "outputs": [],
      "source": [
        "# pip install transformers sentencepiece\n",
        "\n",
        "tokenizer = AutoTokenizer.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
        "model = AutoModel.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
        "# model.cuda()  # uncomment it if you have a GPU"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 20,
      "metadata": {
        "id": "Xsxq-Ohx-rmA"
      },
      "outputs": [],
      "source": [
        "# применяем токенизатор:\n",
        "# -≥ add_special_tokens = добавляем служебные токены (CLS=101, EOS=102)\n",
        "# -≥ truncation = обрезаем по максимальной длине\n",
        "# -≥ max_length = максимальная длина последовательности\n",
        "tokenized = df['description'].apply((lambda x: tokenizer.encode(x,\n",
        "                                                                      add_special_tokens=True,\n",
        "                                                                      truncation=True,\n",
        "                                                                      max_length=1024)))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 21,
      "metadata": {
        "id": "OuaXqHNj-rmB"
      },
      "outputs": [],
      "source": [
        "max_len = 1024\n",
        "# Делаю пэддинг чтобы добить до max_len последовательности\n",
        "padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])\n",
        "# И маску чтобы не применять self-attention на pad\n",
        "attention_mask = np.where(padded != 0, 1, 0)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 22,
      "metadata": {
        "id": "h3bfQh2o-rmC"
      },
      "outputs": [],
      "source": [
        "# Датасет для массивов\n",
        "class BertInputs(torch.utils.data.Dataset):\n",
        "    def __init__(self, tokenized_inputs, attention_masks):\n",
        "        super().__init__()\n",
        "        self.tokenized_inputs = tokenized_inputs\n",
        "        self.attention_masks = attention_masks\n",
        "\n",
        "    def __len__(self):\n",
        "        return self.tokenized_inputs.shape[0]\n",
        "\n",
        "    def __getitem__(self, idx):\n",
        "        ids = self.tokenized_inputs[idx]\n",
        "        ams = self.attention_masks[idx]\n",
        "\n",
        "        return ids, ams\n",
        "\n",
        "dataset = BertInputs(padded, attention_mask)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 23,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Q7yYgEP3-rmC",
        "outputId": "76047d40-f793-4cef-fc02-b98b232661f8"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "torch.Size([100, 1024]) torch.Size([100, 1024])\n"
          ]
        }
      ],
      "source": [
        "#DataLoader чтобы отправлять бачи в цикл обучения\n",
        "loader = torch.utils.data.DataLoader(dataset, batch_size=100, shuffle=True)\n",
        "sample_ids, sample_ams = next(iter(loader))\n",
        "print(sample_ids.shape, sample_ams.shape)\n",
        "\n",
        "# shape BATCH_SIZE x MAX_LEN - что заходит в BERT"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 25,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "r1h0BNy1-rmD",
        "outputId": "adea19c9-a0f2-418c-9a21-ebe8daa00077"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 94/94 [01:13<00:00,  1.28it/s]"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "CPU times: user 1min 10s, sys: 145 ms, total: 1min 10s\n",
            "Wall time: 1min 13s\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        }
      ],
      "source": [
        "%%time\n",
        "\n",
        "vectors_in_batch = []\n",
        "\n",
        "# Iterate over all batches\n",
        "for inputs, attention_masks in tqdm(loader):\n",
        "    vectors_in_mini_batch = []  # Store vectors in mini-batch\n",
        "    with torch.no_grad():\n",
        "        last_hidden_states = model(inputs.cuda(), attention_mask=attention_masks.cuda())\n",
        "        vector = last_hidden_states[0][:,0,:].detach().cpu().numpy()\n",
        "        vectors_in_mini_batch.append(vector)\n",
        "\n",
        "    vectors_in_batch.extend(vectors_in_mini_batch)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 16,
      "metadata": {},
      "outputs": [],
      "source": [
        "import itertools\n",
        "\n",
        "# Open the file and load the nested list\n",
        "vectors_in_batch = load('vectors_in_batch.joblib')\n",
        "\n",
        "# Convert the nested list to an unnested list\n",
        "text_embeddings = list(itertools.chain.from_iterable(vectors_in_batch))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Сохранение эмбеддингов\n",
        "dump(vectors_in_batch, 'vectors_in_batch.joblib')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 17,
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/plain": [
              "94"
            ]
          },
          "execution_count": 17,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "len(vectors_in_batch)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/plain": [
              "9366"
            ]
          },
          "execution_count": 9,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "len(text_embeddings)"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.4"
    },
    "orig_nbformat": 4
  },
  "nbformat": 4,
  "nbformat_minor": 0
}