{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "id": "S52EVP7k-rl7" }, "outputs": [], "source": [ "import pandas as pd\n", "import torch\n", "import re\n", "import string\n", "import numpy as np\n", "import streamlit as st\n", "import faiss # хранение индексов\n", "from tqdm import tqdm\n", "from transformers import AutoTokenizer, AutoModel\n", "from joblib import dump, load # Для сохранения/загрузки эмбэддингов" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "12BEEwcF-rl9" }, "outputs": [], "source": [ "path = '/content/movies_filtered.csv' # ИЗМЕНИ ТУТ ПУТЬ!\n", "a\n", "df = pd.read_csv(path)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "df5lg8-m-rl-" }, "outputs": [], "source": [ "def clean(text):\n", " text = text.lower() # Нижний регистр\n", " text = re.sub(r'\\d+', ' ', text) # Удаляем числа\n", " # text = text.translate(str.maketrans('', '', string.punctuation)) # Удаляем пунктуацию\n", " text = re.sub(r'\\s+', ' ', text) # Удаляем лишние пробелы\n", " text = text.strip() # Удаляем начальные и конечные пробелы\n", " text = re.sub(r'\\s+|\\n', ' ', text) # Удаляет \\n и \\xa0\n", " # text = re.sub(r'\\b\\w{1,2}\\b', '', text) # Удаляем слова длиной менее 3 символов\n", " # Дополнительные шаги, которые могут быть полезны в данном контексте:\n", " # text = re.sub(r'\\b\\w+\\b', '', text) # Удаляем отдельные слова (без чисел и знаков препинания)\n", " # text = ' '.join([word for word in text.split() if word not in stop_words]) # Удаляем стоп-слова\n", " return text\n", "\n", "for i, row in df.iterrows():\n", " df.at[i, 'description'] = clean(row['description'])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0huKeMs4-rl_", "outputId": "8659997c-9b8a-45bb-e2d7-fcc05422b92a" }, "outputs": [], "source": [ "# pip install transformers sentencepiece\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"cointegrated/rubert-tiny2\")\n", "model = AutoModel.from_pretrained(\"cointegrated/rubert-tiny2\")\n", "# model.cuda() # uncomment it if you have a GPU" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "id": "Xsxq-Ohx-rmA" }, "outputs": [], "source": [ "# применяем токенизатор:\n", "# -≥ add_special_tokens = добавляем служебные токены (CLS=101, EOS=102)\n", "# -≥ truncation = обрезаем по максимальной длине\n", "# -≥ max_length = максимальная длина последовательности\n", "tokenized = df['description'].apply((lambda x: tokenizer.encode(x,\n", " add_special_tokens=True,\n", " truncation=True,\n", " max_length=1024)))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "id": "OuaXqHNj-rmB" }, "outputs": [], "source": [ "max_len = 1024\n", "# Делаю пэддинг чтобы добить до max_len последовательности\n", "padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])\n", "# И маску чтобы не применять self-attention на pad\n", "attention_mask = np.where(padded != 0, 1, 0)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "id": "h3bfQh2o-rmC" }, "outputs": [], "source": [ "# Датасет для массивов\n", "class BertInputs(torch.utils.data.Dataset):\n", " def __init__(self, tokenized_inputs, attention_masks):\n", " super().__init__()\n", " self.tokenized_inputs = tokenized_inputs\n", " self.attention_masks = attention_masks\n", "\n", " def __len__(self):\n", " return self.tokenized_inputs.shape[0]\n", "\n", " def __getitem__(self, idx):\n", " ids = self.tokenized_inputs[idx]\n", " ams = self.attention_masks[idx]\n", "\n", " return ids, ams\n", "\n", "dataset = BertInputs(padded, attention_mask)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Q7yYgEP3-rmC", "outputId": "76047d40-f793-4cef-fc02-b98b232661f8" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([100, 1024]) torch.Size([100, 1024])\n" ] } ], "source": [ "#DataLoader чтобы отправлять бачи в цикл обучения\n", "loader = torch.utils.data.DataLoader(dataset, batch_size=100, shuffle=True)\n", "sample_ids, sample_ams = next(iter(loader))\n", "print(sample_ids.shape, sample_ams.shape)\n", "\n", "# shape BATCH_SIZE x MAX_LEN - что заходит в BERT" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "r1h0BNy1-rmD", "outputId": "adea19c9-a0f2-418c-9a21-ebe8daa00077" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 94/94 [01:13<00:00, 1.28it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1min 10s, sys: 145 ms, total: 1min 10s\n", "Wall time: 1min 13s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "%%time\n", "\n", "vectors_in_batch = []\n", "\n", "# Iterate over all batches\n", "for inputs, attention_masks in tqdm(loader):\n", " vectors_in_mini_batch = [] # Store vectors in mini-batch\n", " with torch.no_grad():\n", " last_hidden_states = model(inputs.cuda(), attention_mask=attention_masks.cuda())\n", " vector = last_hidden_states[0][:,0,:].detach().cpu().numpy()\n", " vectors_in_mini_batch.append(vector)\n", "\n", " vectors_in_batch.extend(vectors_in_mini_batch)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "import itertools\n", "\n", "# Open the file and load the nested list\n", "vectors_in_batch = load('vectors_in_batch.joblib')\n", "\n", "# Convert the nested list to an unnested list\n", "text_embeddings = list(itertools.chain.from_iterable(vectors_in_batch))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Сохранение эмбеддингов\n", "dump(vectors_in_batch, 'vectors_in_batch.joblib')" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "94" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(vectors_in_batch)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9366" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(text_embeddings)" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 0 }