{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "cellId": "nwmnsa077j0li6frzyvoc" }, "outputs": [], "source": [ "import torch\n", "import numpy as np\n", "import json\n", "from tqdm.auto import tqdm" ] },
{ "cell_type": "markdown", "metadata": { "cellId": "dfsqzm9i1rg0djpmrynj4" }, "source": [ "Для начала распарсим датасет \"по умолчанию\"." ] },
{ "cell_type": "code", "execution_count": 2, "metadata": { "cellId": "g133nsvyrtio3fpwpn54m" }, "outputs": [], "source": [ "import json\n", "\n", "# The context manager closes the file even if parsing fails; JSON text is read as UTF-8.\n", "with open('arxivData.json', encoding='utf-8') as file:\n", "    data = json.load(file)" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": { "cellId": "04b7jpapxwa243bwo5a9fic" }, "outputs": [], "source": [ "def trl(container):\n", "    \"\"\"Return a tqdm progress bar over the integer indices of `container`.\"\"\"\n", "    return tqdm(range(len(container)))\n", "\n", "\n", "def prepared(string):\n", "    \"\"\"Textually rewrite a Python-literal style string towards JSON.\n", "\n", "    Every single quote becomes a double quote and every occurrence of\n", "    'None' becomes 'null'. The substitution is purely textual, so the same\n", "    characters inside values are rewritten as well.\n", "    \"\"\"\n", "    return string.replace(\"'\", '\"').replace('None', 'null')" ] },
{ "cell_type": "code", "execution_count": 322, "metadata": { "cellId": "5pr61iqulb4yxea8bjlhw" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "78535363d49b48889ed5beaf19af863c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=41000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "dc1c7b04e9634db295cde22088934053", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=41000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "039e3d572c70452ca0710e6c61d10b80", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=41000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ 
"\n", "\n", "\n" ] } ], "source": [ "titles = [record['title'] for record in tqdm(data)]\n", "summaries = [record['summary'] for record in tqdm(data)]\n", "# Parse each record's tag string exactly once and keep the raw tag names;\n", "# the id conversion further down builds a new list, so cells can be re-run safely.\n", "tag_names = [[entry['term'] for entry in json.loads(prepared(record['tag']))] for record in tqdm(data)]" ] },
{ "cell_type": "markdown", "metadata": { "cellId": "i36xlscxkm9t8ab4hye7c" }, "source": [ "Теперь соберем данные о тегах со страницы арксива." ] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "cellId": "h0nt7pj6blgyodi07nt3e" }, "outputs": [], "source": [ "import requests\n", "\n", "# Fail fast on network problems instead of hanging or silently parsing an error page.\n", "page = requests.get(\"https://arxiv.org/category_taxonomy\", timeout=30)\n", "page.raise_for_status()" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": { "cellId": "eqd8ffof4m97mqffttt" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/kernel/lib/python3.8/site-packages/ml_kernel/kernel.py:872: UserWarning: The following variables cannot be serialized: soup\n", " warnings.warn(message)\n" ] } ], "source": [ "from bs4 import BeautifulSoup\n", "soup = BeautifulSoup(page.content, 'html.parser')" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": { "cellId": "7lwv6ajhf8ewgolxzof1bm" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/kernel/lib/python3.8/site-packages/ml_kernel/kernel.py:872: UserWarning: The following variables cannot be serialized: h4_cases, soup\n", " warnings.warn(message)\n" ] } ], "source": [ "h4_cases = soup.find_all('h4')" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": { "cellId": "ybnwppuruulh7ljycj7rh" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/kernel/lib/python3.8/site-packages/ml_kernel/kernel.py:872: UserWarning: The following variables cannot be serialized: h4_cases, html_tag, soup, tags_tags\n", " warnings.warn(message)\n" ] } ], "source": [ "# NOTE(review): the first h4 heading is skipped; presumably it is not a category entry - confirm against the live page.\n", "tags_tags = h4_cases[1:]\n", "possible_tags = set()\n", "tag_to_id = {}\n", "id_to_tag = {}\n", "id_to_description = {}\n", "for i, html_tag in enumerate(tags_tags):\n", "    text = html_tag.get_text()\n", "    # Everything before the first space is the tag; the remainder, minus its\n", "    # first and last character, is the description.\n", "    idx = text.find(' ')\n", "    tag = text[:idx]\n", "    description = text[idx + 2:-1]\n", "    possible_tags.add(tag)\n", "    tag_to_id[tag] = i\n", "    id_to_tag[i] = tag\n", "    id_to_description[i] = description" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": { "cellId": "pplhqb4oesl6pq0w2pr3fe" }, "outputs": [], "source": [ "num_tags = len(possible_tags)" ] },
{ "cell_type": "code", "execution_count": 328, "metadata": { "cellId": "vzlkypl5cyl3nagb85pphc" }, "outputs": [], "source": [ "# One line per class id: the tag, a space, then its description.\n", "with open('tags.txt', 'w') as file:\n", "    for i in range(num_tags):\n", "        file.write(f'{id_to_tag[i]} {id_to_description[i]}\\n')" ] },
{ "cell_type": "markdown", "metadata": { "cellId": "vsiuhkjhmqkygl63n2pk9" }, "source": [ "И переведем теги в индексы." ] },
{ "cell_type": "code", "execution_count": 329, "metadata": { "cellId": "m26ffr9sr4rc00y2kupo8q" }, "outputs": [], "source": [ "# Keep only tags known from the taxonomy page and map them to ids.\n", "# A new list is built from `tag_names`, so re-running this cell gives the same result.\n", "tags = [[tag_to_id[name] for name in set(names) & possible_tags] for names in tag_names]" ] },
{ "cell_type": "markdown", "metadata": { "cellId": "jgvhsk815g9k8wu66ftx" }, "source": [ "Разобьем данные на трейн и тест..." ] },
{ "cell_type": "code", "execution_count": 330, "metadata": { "cellId": "icw5n8u2se021wsdwlwio2" }, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "titles_train, titles_test, summaries_train, summaries_test, tags_train, tags_test = train_test_split(titles, summaries, tags, test_size=0.1, random_state=42)" ] },
{ "cell_type": "markdown", "metadata": { "cellId": "rs6f6fyz5nn8x8ycz12m" }, "source": [ "И создадим датасеты и даталоадеры." 
] }, { "cell_type": "code", "execution_count": 9, "metadata": { "cellId": "9mji2qz8u8q79ays6y1j6" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d7c008df0c254cefbd8d2402ae5020ae", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=28.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b1e94a7ab27e40a9967f12f7011076c7", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=483.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c676bcfd5d8a49e9bb8b5c35ca64cd33", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231508.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "dc4baeeb13f04f5da7bc57094aa0828f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=466062.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n" ] } ], "source": [ "#!g1.1\n", "from transformers import AutoTokenizer, AutoModel, pipeline, DistilBertForSequenceClassification\n", "\n", "# Checkpoint name shared by the tokenizer below and by the classifiers loaded in later cells.\n", "base_model_name = 'distilbert-base-uncased'\n", "base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n", "# Left disabled: the bare encoder is only referenced by the unused BaseArXivClassifier.\n", "#base_model = AutoModel.from_pretrained(base_model_name).to('cuda')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "cellId": "1cumn09ceki8g3916s4e4" }, "outputs": [ { "data": { "text/plain": [ "('./tokenizer/tokenizer_config.json',\n", " 
'./tokenizer/special_tokens_map.json',\n", " './tokenizer/vocab.txt',\n", " './tokenizer/added_tokens.json',\n", " './tokenizer/tokenizer.json')" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#!g1.1\n", "base_tokenizer.save_pretrained('./tokenizer')" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": { "cellId": "3rka3rozlhe1pwg0ewqvfs" }, "outputs": [], "source": [ "def multi_hot(indexes, num_classes):\n", "    \"\"\"Return a float vector of length `num_classes` with ones at `indexes`.\n", "\n", "    An empty `indexes` yields an all-zero vector.\n", "    \"\"\"\n", "    target = torch.zeros((num_classes,))\n", "    # The explicit integer dtype keeps an empty list usable as an index\n", "    # (torch.tensor([]) alone would be a float tensor).\n", "    target[torch.tensor(indexes, dtype=torch.long)] = 1.0\n", "    return target" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": { "cellId": "ieptlunauzkup3dj55hvnc" }, "outputs": [], "source": [ "from torch.utils.data import Dataset, DataLoader\n", "\n", "\n", "class ArXivDataset(Dataset):\n", "    \"\"\"Tokenized 'Title: ... Abstract: ...' texts paired with normalized multi-hot tag targets.\n", "\n", "    Args:\n", "        titles: sequence of title strings.\n", "        abstracts: sequence of abstract strings, aligned with `titles`.\n", "        tags: per-example lists of integer class ids.\n", "        possible_tags: collection of all tags; its length is the number of classes.\n", "        tokenizer: callable applied once, in __init__, to all texts.\n", "    \"\"\"\n", "\n", "    def __init__(self, titles, abstracts, tags, possible_tags, tokenizer):\n", "        super().__init__()\n", "        self.titles = titles\n", "        self.abstracts = abstracts\n", "        self.tags = tags\n", "        self.possible_tags = possible_tags\n", "        self.num_classes = len(self.possible_tags)\n", "        self.tokenizer = tokenizer\n", "\n", "        texts = ['Title: ' + self.titles[idx] + ' Abstract: ' + self.abstracts[idx] for idx in range(len(titles))]\n", "        # NOTE(review): only input_ids are kept; the tokenizer's attention mask is dropped,\n", "        # so consumers of this dataset get no padding mask.\n", "        self.inputs = self.tokenizer(texts, truncation=True, padding='max_length', max_length=512, return_tensors='pt')['input_ids']\n", "\n", "    def __len__(self):\n", "        return len(self.titles)\n", "\n", "    def __getitem__(self, idx):\n", "        \"\"\"Return (input_ids, target); the target sums to 1 over the example's tags.\"\"\"\n", "        labels = self.tags[idx]\n", "        # max(..., 1) keeps an example without tags as an all-zero target instead of NaN.\n", "        target = multi_hot(labels, self.num_classes) / max(len(labels), 1)\n", "        return (self.inputs[idx], target)" ] },
{ "cell_type": "code", "execution_count": 334, "metadata": { "cellId": "vapsctsu94mjwcjydp4al" }, "outputs": [], "source": [ "#!g1.1\n", "base_train_ds = ArXivDataset(titles_train, summaries_train, tags_train, possible_tags, base_tokenizer)\n", "base_test_ds = ArXivDataset(titles_test, summaries_test, tags_test, possible_tags, base_tokenizer)\n", "\n", "base_train_dl = DataLoader(base_train_ds, shuffle=True,\n", "                           batch_size=128, num_workers=0)\n", "base_test_dl = DataLoader(base_test_ds, shuffle=True,\n", "                          batch_size=128, num_workers=0)\n", "\n", "# Small subset of the training data (first 256 examples).\n", "small_ds = ArXivDataset(titles_train[:256], summaries_train[:256], tags_train[:256], possible_tags, base_tokenizer)\n", "small_dl = DataLoader(small_ds, shuffle=True,\n", "                      batch_size=128, num_workers=0)" ] },
{ "cell_type": "markdown", "metadata": { "cellId": "61ne5cjsd7uuxa67duqn1g" }, "source": [ "Следующий класс в итоге не используется, мы будем использовать встроенную архитектуру головы классификатора." ] },
{ "cell_type": "code", "execution_count": 12, "metadata": { "cellId": "wmqvly8ahucv5xfnudiaj" }, "outputs": [], "source": [ "#!g1.1\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "\n", "class BaseArXivClassifier(nn.Module):\n", "    \"\"\"Linear softmax head over the flattened hidden states of a frozen encoder.\n", "\n", "    Args:\n", "        backbone: encoder whose output exposes `last_hidden_state`; when omitted,\n", "            forward() falls back to the notebook-level `base_model`.\n", "        num_classes: number of output classes.\n", "        seq_len: token length of every input sequence.\n", "        hidden_size: width of the encoder hidden states.\n", "    \"\"\"\n", "\n", "    def __init__(self, backbone=None, num_classes=155, seq_len=512, hidden_size=768):\n", "        super().__init__()\n", "        self.backbone = backbone\n", "        self.head = nn.Sequential(\n", "            nn.Flatten(),\n", "            nn.Linear(seq_len * hidden_size, num_classes),\n", "            nn.Softmax(dim=1)\n", "        )\n", "\n", "    def forward(self, X):\n", "        encoder = self.backbone if self.backbone is not None else base_model\n", "        # The encoder is used as a fixed feature extractor: no gradients flow into it.\n", "        with torch.no_grad():\n", "            X = encoder(X).last_hidden_state\n", "        return self.head(X)" ] },
{ "cell_type": "markdown", "metadata": { "cellId": "mavysgsu3gr19liok5m4p3" }, "source": [ "Определим функции для тренировки и отрисовки графиков." ] },
{ "cell_type": "code", "execution_count": 13, "metadata": { "cellId": "5dbnf9xl7hif6k1uzavgjr" }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "\n", "def plot_learning_process(train_loss, val_loss):\n", "    \"\"\"Plot per-epoch training and validation loss.\n", "\n", "    Training points are drawn at epoch - 0.5 and validation points at the epoch end.\n", "    \"\"\"\n", "    plt.figure()\n", "    plt.title('loss by epoch')\n", "    # Each curve gets an x-axis of its own length, so histories of different\n", "    # lengths (for example after an interrupted epoch) can still be plotted.\n", "    plt.plot(np.arange(len(train_loss)) + 0.5, train_loss, label='train')\n", "    plt.plot(np.arange(len(val_loss)) + 1, val_loss, label='val')\n", "    plt.legend()\n", "    plt.grid()\n", "    plt.xlabel('epoch')\n", "    plt.ylabel('loss function')\n", "    plt.show()" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": { "cellId": "441ify9vq4qumr997qqum" }, "outputs": [], "source": [ "#!g1.1\n", "\n", 
"# Use the GPU when one is present; otherwise fall back to the CPU.\n", "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", "\n", "import IPython\n", "from math import ceil\n", "\n", "\n", "def train_loop(model, dataloader, loss_fn, optimizer, step=0.05, history_loss=None):\n", "    \"\"\"Train `model` for one pass over `dataloader`.\n", "\n", "    Args:\n", "        model: callable whose output exposes `.logits`.\n", "        dataloader: yields (inputs, targets) batches.\n", "        loss_fn: called as loss_fn(softmax(logits), targets).\n", "        optimizer: optimizer stepping the model parameters.\n", "        step: fraction of the epoch between two progress messages.\n", "        history_loss: optional list; the mean batch loss is appended to it.\n", "\n", "    Returns:\n", "        {'train_loss': mean batch loss}.\n", "    \"\"\"\n", "    out = display(IPython.display.Pretty('Learning...'), display_id=True)\n", "\n", "    model.train()\n", "    size = len(dataloader.dataset)\n", "    last_batch = ceil(size / dataloader.batch_size) - 1\n", "\n", "    train_loss = []\n", "    percentage = 0\n", "    for batch, (X, y) in enumerate(tqdm(dataloader, leave=False, desc=\"Batch #\")):\n", "        X, y = X.to(device), y.to(device)\n", "        # evaluate\n", "        output = F.softmax(model(X).logits, dim=1)\n", "        loss = loss_fn(output, y)\n", "        train_loss.append(loss.item())\n", "\n", "        # backpropagation\n", "        optimizer.zero_grad()\n", "        loss.backward()\n", "        optimizer.step()\n", "\n", "        # print info; a single-batch loader has last_batch == 0 and counts as fully done\n", "        # instead of dividing by zero\n", "        progress = batch / last_batch if last_batch > 0 else 1.0\n", "        if progress > percentage or batch == last_batch:\n", "            out.update(f'[{int(percentage * size)}/{size}] Loss: {train_loss[-1]:>8f}')\n", "            percentage += step\n", "\n", "    if history_loss is not None:\n", "        history_loss.append(np.mean(train_loss))\n", "\n", "    return {'train_loss': np.mean(train_loss)}\n", "\n", "\n", "def test_loop(model, dataloader, loss_fn, history_loss=None, history_acc=None):\n", "    \"\"\"Evaluate `model` on `dataloader` without updating it.\n", "\n", "    Accuracy is the top-1 hit rate: an example counts as correct when the\n", "    highest-scoring class has a non-zero entry in its target vector.\n", "\n", "    Args:\n", "        model: callable whose output exposes `.logits`.\n", "        dataloader: yields (inputs, targets) batches.\n", "        loss_fn: called as loss_fn(softmax(logits), targets).\n", "        history_loss: optional list; the mean batch loss is appended to it.\n", "        history_acc: optional list; the accuracy is appended to it.\n", "\n", "    Returns:\n", "        {'val_loss': mean batch loss, 'val_acc': top-1 hit rate}.\n", "    \"\"\"\n", "    size = len(dataloader.dataset)\n", "    correct = 0\n", "    val_loss = []\n", "\n", "    # Run in eval mode so train-only layers are disabled, then restore the caller's mode.\n", "    was_training = model.training\n", "    model.eval()\n", "    try:\n", "        with torch.no_grad():\n", "            # evaluate and check predictions\n", "            for X, y in tqdm(dataloader, leave=False, desc='Batch #'):\n", "                X, y = X.to(device), y.to(device)\n", "                output = F.softmax(model(X).logits, dim=1)\n", "                val_loss.append(loss_fn(output, y).item())\n", "\n", "                top1 = output.argmax(dim=1, keepdim=True)\n", "                correct += (y.gather(1, top1) > 0).sum().item()\n", "    finally:\n", "        model.train(was_training)\n", "\n", "    test_loss = np.mean(val_loss)\n", "    accuracy = correct / size if size else 0.0\n", "\n", "    print(f\"Validation accuracy: {(100*accuracy):>0.1f}%, Validation loss: {test_loss:>8f} \\n\")\n", "\n", "    if history_loss is not None:\n", "        history_loss.append(test_loss)\n", "    if history_acc is not None:\n", "        history_acc.append(accuracy)\n", "\n", "    return {'val_loss': test_loss, 'val_acc': accuracy}" ] },
{ "cell_type": "markdown", "metadata": { "cellId": "eihlww9xq7sp8o0j4al6ir" }, "source": [ "И натренируем нашу базовую модель." ] },
{ "cell_type": "code", "execution_count": 338, "metadata": { "cellId": "vspefy8lcknk40w5yxt", "collapsed": true, "jupyter": { "outputs_hidden": true } }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ea9879f7935b4807a7d5bd38c183a3ef", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=483.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b7227056dcac4587b0eac589ee7b0f51", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=267967963.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']\n", "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "Epoch 1\n", "-------------------------------\n" ] }, { "data": { "text/plain": [ "Learning..." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "907aa42b0a7746b0898b9bd46a9e0cf2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value='Batch #'), FloatProgress(value=0.0, max=289.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[0/36900] Loss: 0.038880'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[1845/36900] Loss: 0.034217'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[3690/36900] Loss: 0.029051'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[5535/36900] Loss: 0.021646'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[7380/36900] Loss: 0.022256'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[9225/36900] Loss: 0.023418'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[11070/36900] Loss: 0.021545'" ] }, "metadata": 
{}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[12915/36900] Loss: 0.021974'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[14759/36900] Loss: 0.021328'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[16605/36900] Loss: 0.020796'" ] }, "metadata": {}, "output_type": "display_data" }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Epoch {epoch+1}\\n-------------------------------\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0mtrain_loop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclassifier\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbase_train_dl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloss_fn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhistory_loss\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrain_loss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 22\u001b[0m \u001b[0mtest_loop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclassifier\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbase_test_dl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloss_fn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhistory_loss\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mval_loss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain_loop\u001b[0;34m(model, dataloader, loss_fn, optimizer, step, 
history_loss)\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mF\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msoftmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogits\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloss_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0mtrain_loss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;31m# backpropagation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "#!g1.1\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "from IPython.display import clear_output\n", "\n", "classifier = DistilBertForSequenceClassification.from_pretrained(base_model_name, num_labels=num_tags).to('cuda')\n", "# Freeze everything under `base_model`; the remaining parameters stay trainable.\n", "for param in classifier.base_model.parameters():\n", "    param.requires_grad = False\n", "\n", "loss_fn = F.binary_cross_entropy\n", "optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-4)\n", "epochs = 30\n", "\n", "train_loss = []\n", "val_loss = []\n", "\n", "for epoch in range(epochs):\n", "    print(f\"Epoch {epoch+1}\\n-------------------------------\")\n", "\n", "    # Switch modes explicitly every epoch so that validation never runs in train mode.\n", "    classifier.train()\n", "    train_loop(classifier, base_train_dl, loss_fn, optimizer, history_loss=train_loss)\n", "    classifier.eval()\n", "    test_loop(classifier, base_test_dl, loss_fn, history_loss=val_loss)\n", "\n", "    clear_output()\n", "    plot_learning_process(train_loss, val_loss)" ] },
{ "attachments": { "72b0e3ec-070b-4ffe-a261-f72c4a5740fe.png": { "image/png": "" } }, "cell_type": "markdown", "metadata": { "cellId": "a3sc3gned3uygxq5ywd529" }, "source": [ "Прошу прощения, исходный output ячейки потерялся, потому что я случайно перезапустил эту ячейку, когда тренировал следующую версию модели. Тем не менее, у меня сохранился график обучения, который я прилагаю здесь.\n", "![image.png](attachment:72b0e3ec-070b-4ffe-a261-f72c4a5740fe.png)" ] },
{ "cell_type": "markdown", "metadata": { "cellId": "gkqtjsnon6biela1hxs3z" }, "source": [ "Сохраним ее, чтобы иметь возможность использовать ее в интерфейсе." ] },
{ "cell_type": "code", "execution_count": 66, "metadata": { "cellId": "pweno0g9r3w2a22ziovd" }, "outputs": [], "source": [ "#!g1.1\n", "classifier.save_pretrained('./models')" ] },
{ "cell_type": "markdown", "metadata": { "cellId": "tiegy2nj0ccm3c8ec8a2jj" }, "source": [ "Теперь попробуем повторить историю со специализированной моделью той же архитектуры, но предобученной на научных статьях." 
] }, { "cell_type": "code", "execution_count": 312, "metadata": { "cellId": "my2w2e2vxpnb396jrppeoa" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "516dd347c0544191b3e448f1467ccb70", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=385.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "70064bb0aef5416db5980f8cca51fdae", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=227845.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] } ], "source": [ "#!g1.1\n", "# Checkpoint name and matching tokenizer for the second experiment.\n", "sci_model_name = 'allenai/scibert_scivocab_uncased'\n", "sci_tokenizer = AutoTokenizer.from_pretrained(sci_model_name)" ] },
{ "cell_type": "code", "execution_count": 317, "metadata": { "cellId": "sclvuck179m0oe02zkimp5" }, "outputs": [], "source": [ "#!g1.1\n", "# Same train/test split as before, re-tokenized with the new tokenizer.\n", "sci_train_ds = ArXivDataset(titles_train, summaries_train, tags_train, possible_tags, sci_tokenizer)\n", "sci_test_ds = ArXivDataset(titles_test, summaries_test, tags_test, possible_tags, sci_tokenizer)\n", "\n", "sci_train_dl = DataLoader(sci_train_ds, batch_size=256, shuffle=True, num_workers=0)\n", "sci_test_dl = DataLoader(sci_test_ds, batch_size=256, shuffle=True, num_workers=0)" ] },
{ "cell_type": "code", "execution_count": 318, "metadata": { "cellId": "44aiscms3qxnuo03p2puv9" }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 7\n", "-------------------------------\n" ] }, { "data": { "text/plain": [ "Learning..." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "59aed5898485404a8a18eb9f6300eadc", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value='Batch #'), FloatProgress(value=0.0, max=145.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[0/36900] Loss: 0.021270'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[1845/36900] Loss: 0.021709'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[3690/36900] Loss: 0.021914'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[5535/36900] Loss: 0.021512'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[7380/36900] Loss: 0.020658'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[9225/36900] Loss: 0.021273'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[11070/36900] Loss: 0.021641'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[12915/36900] Loss: 0.020861'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[14759/36900] Loss: 0.021167'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[16605/36900] Loss: 0.021825'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[18449/36900] Loss: 0.021072'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[20294/36900] Loss: 0.021985'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": 
[ "'[22140/36900] Loss: 0.020836'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[23985/36900] Loss: 0.022309'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[25830/36900] Loss: 0.021078'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'[27675/36900] Loss: 0.021097'" ] }, "metadata": {}, "output_type": "display_data" }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Epoch {epoch+1}\\n-------------------------------\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mtrain_loop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msci_classifier\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msci_train_dl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloss_fn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhistory_loss\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrain_loss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0mtest_loop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msci_classifier\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msci_test_dl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloss_fn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhistory_loss\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mval_loss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain_loop\u001b[0;34m(model, 
dataloader, loss_fn, optimizer, step, history_loss)\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "#!g1.1\n", "from transformers import AutoModelForSequenceClassification\n", "sci_classifier = AutoModelForSequenceClassification.from_pretrained(sci_model_name, num_labels=num_tags).to('cuda')\n", "# Freeze everything under `base_model`; the remaining parameters stay trainable.\n", "for param in sci_classifier.base_model.parameters():\n", "    param.requires_grad = False\n", "\n", "loss_fn = F.binary_cross_entropy\n", "optimizer = torch.optim.Adam(sci_classifier.parameters(), lr=1e-4)\n", "epochs = 30\n", "\n", "train_loss = []\n", "val_loss = []\n", "\n", "for epoch in range(epochs):\n", "    print(f\"Epoch {epoch+1}\\n-------------------------------\")\n", "\n", "    # Switch modes explicitly every epoch so that validation never runs in train mode.\n", "    sci_classifier.train()\n", "    train_loop(sci_classifier, sci_train_dl, loss_fn, optimizer, history_loss=train_loss)\n", "    sci_classifier.eval()\n", "    test_loop(sci_classifier, sci_test_dl, loss_fn, history_loss=val_loss)\n", "\n", "    clear_output()\n", "    plot_learning_process(train_loss, val_loss)" ] },
{ "cell_type": "markdown", "metadata": { "cellId": "4tyrvht7r3rk4dny2oyk5" }, "source": [ "Видим, что разницы в результатах не заметно, поэтому остановим обучение." ] },
{ "cell_type": "markdown", "metadata": { "cellId": "cmyx82lx7eixgamzj6jbza" }, "source": [ "Теперь попробуем датасет побольше (существенно побольше). Он взят [отсюда](https://www.kaggle.com/datasets/Cornell-University/arxiv?select=arxiv-metadata-oai-snapshot.json). Мы используем только часть датасета (100'000 статей)." 
] }, { "cell_type": "code", "execution_count": 15, "metadata": { "cellId": "4p7b0qtd94vy281idav2o" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1df6c68fe89c4f9aab024a5508bfb48a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "from itertools import islice\n", "\n", "# Number of records taken from the start of the snapshot (one JSON document per line).\n", "MAX_ARTICLES = 10**5\n", "\n", "with open('arxiv-metadata-oai-snapshot.json', encoding='utf-8') as big_file:\n", "    big_data = [json.loads(line) for line in tqdm(islice(big_file, MAX_ARTICLES))]" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": { "cellId": "dxhbuir0c2iux8wfptpc" }, "outputs": [], "source": [ "# Same helper as defined near the top of the notebook; repeated here so this part can be run on its own.\n", "def trl(container):\n", "    \"\"\"Return a tqdm progress bar over the integer indices of `container`.\"\"\"\n", "    return tqdm(range(len(container)))" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": { "cellId": "dyijqg6hqaoorj7jqdj3hj" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8ee2a9096f28450d9a0da04c0455678c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "741143f6b10b4d2898bca8a4caf91227", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8777fbb3bc8c4d85851d55fa44dea200", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": 
"stream", "text": [ "\n", "\n", "\n" ] } ], "source": [ "titles = [big_data[i]['title'] for i in trl(big_data)]\n", "summaries = [big_data[i]['abstract'] for i in trl(big_data)]\n", "tags = [big_data[i]['categories'].split() for i in trl(big_data)]" ] }, { "cell_type": "markdown", "metadata": { "cellId": "ne6lc68j6kry5of6x805d" }, "source": [ "По каким-то причинам небольшая доля статей в этих данных не имеет ни одного тега из списка из 155 тегов, вытащенных нами с официальной страницы арксива. Чтобы избавиться от ситуации, когда у примера \"нет правильных ответов\" мы добавляем особый тег None, отвечающий за эту ситуацию. Разумеется, нам придется позже отфильтровать эту \"заглушку\" в пользовательской выдаче результатов. На практике, модель почти не пользуется этим тегом и предсказывает его очень редко и очень глубоко в списке тегов." ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "cellId": "175cqj7ibqujnrz43e7vhq" }, "outputs": [], "source": [ "tag_to_id['None'] = 155\n", "id_to_tag[155] = 'None'\n", "id_to_description[155] = 'No tag'\n", "possible_tags.add('None')" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "cellId": "z8tqaxe8t33akw7spxi3o" }, "outputs": [], "source": [ "for i, line in enumerate(tags):\n", " line = list(set(line) & possible_tags)\n", " int_line = []\n", " for tag in line:\n", " int_line.append(tag_to_id[tag])\n", " if len(int_line) == 0:\n", " int_line.append(155)\n", " tags[i] = int_line" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "cellId": "5iqci44rvbpzh2vnulsd1s" }, "outputs": [], "source": [ "#!g1.1\n", "from sklearn.model_selection import train_test_split\n", "\n", "titles_train, titles_test, summaries_train, summaries_test, tags_train, tags_test = train_test_split(titles, summaries, tags, test_size=0.01, random_state=42)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "cellId": "od5nv2zwd3t0mjtp4qhdl6d" }, "outputs": [], "source": [ "#!g1.1\n", "big_train_ds = 
#!g1.1
# Wrap the train / held-out splits in the project's dataset class.
big_train_ds = ArXivDataset(titles_train, summaries_train, tags_train, possible_tags, base_tokenizer)
big_test_ds = ArXivDataset(titles_test, summaries_test, tags_test, possible_tags, base_tokenizer)

#!g1.1
BATCH_SIZE = 128

big_train_dl = DataLoader(big_train_ds, shuffle=True,
                          batch_size=BATCH_SIZE, num_workers=0)
# The evaluation loader is not shuffled: shuffling brings nothing when the data
# is only scored, and a fixed order keeps evaluation batches reproducible.
big_test_dl = DataLoader(big_test_ds, shuffle=False,
                         batch_size=BATCH_SIZE, num_workers=0)
#!g1.1
# Load the pretrained checkpoint with a freshly initialised classification head.
# The head has one output per known tag plus one extra label
# (presumably the 'None' placeholder tag — confirm against the tag tables).
classifier = DistilBertForSequenceClassification.from_pretrained(
    base_model_name,
    num_labels=num_tags + 1,
)
# Move the model to the GPU.
classifier = classifier.to('cuda')
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "#!g1.1\n", "\n", "from IPython.display import clear_output\n", "\n", "for param in classifier.base_model.parameters():\n", " param.requires_grad = False\n", "\n", "classifier.train()\n", "loss_fn = F.binary_cross_entropy\n", "optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-4)\n", "epochs = 10\n", "\n", "train_loss = []\n", "val_loss = []\n", " \n", "for epoch in range(epochs):\n", " print(f\"Epoch {epoch+1}\\n-------------------------------\")\n", " \n", " train_loop(classifier, big_train_dl, loss_fn, optimizer, history_loss=train_loss)\n", " test_loop(classifier, big_test_dl, loss_fn, history_loss=val_loss)\n", " \n", " clear_output()\n", " plot_learning_process(train_loss, val_loss)" ] }, { "cell_type": "markdown", "metadata": { "cellId": "tr49elk25f4hlsochz07w" }, "source": [ "Видно, что 10 эпох недостаточно и хотелось бы потренироваться еще немного. Так и сделаем (предварительно сохранив классификатор на всякий случай):" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "cellId": "xwtyveu2c3zuzq5eacqs" }, "outputs": [], "source": [ "#!g1.1\n", "classifier.save_pretrained('./big_models')" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "cellId": "wscjyljcn4xakf6wku95s" }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "#!g1.1\n", "epochs = 10\n", "\n", "for epoch in range(epochs):\n", " print(f\"Epoch {epoch+1}\\n-------------------------------\")\n", " \n", " train_loop(classifier, big_train_dl, loss_fn, optimizer, history_loss=train_loss)\n", " test_loop(classifier, big_test_dl, loss_fn, history_loss=val_loss)\n", " \n", " clear_output()\n", " plot_learning_process(train_loss, val_loss)" ] }, { "cell_type": "markdown", "metadata": { "cellId": "92tr8y80qhn03ukqzs2f3so" }, "source": [ "Эта версия используется в финальном сервисе, присланном в качестве решения.\n", "\n", "Далее мы вводим несколько функции для обработки результатов и для визуального контроля качества модели." ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "cellId": "kbw34ornw4l2h32kbi3g" }, "outputs": [], "source": [ "#!g1.1\n", "\n", "def top_xx(preds, xx=95):\n", " tops = torch.argsort(preds, 1, descending=True)\n", " total = 0\n", " index = 0\n", " result = []\n", " while total < xx / 100:\n", " next_id = tops[0, index].item()\n", " if next_id == 155:\n", " continue\n", " total += preds[0, next_id]\n", " index += 1\n", " result.append({'tag': id_to_tag[next_id], 'description': id_to_description[next_id]})\n", " return result\n", "\n", "def print_data(idx):\n", " print('Title:')\n", " print(big_test_ds.titles[idx])\n", " print('\\nAbstract:')\n", " print(big_test_ds.abstracts[idx])\n", "\n", "def correct_classes(idx):\n", " return [id_to_tag[i] for i in big_test_ds.tags[idx]]\n", "\n", "def top_xx_guesses(model, idx, xx):\n", " print(type(model(big_test_ds[idx][0].to('cuda').reshape(1, -1))))\n", " preds = F.softmax(model(big_test_ds[idx][0].to('cuda').reshape(1, -1)).logits, dim=1)\n", " return [tag['tag'] for tag in top_xx(preds, xx)]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "cellId": "h5gkji1otgnc2l8qytswhf" }, "outputs": [], "source": [ "#!g1.1\n", "def 
#!g1.1
def press_x_to_win(idx, xx):
    """Show held-out sample ``idx`` next to its true and predicted tags.

    Prints the title and abstract, then the ground-truth tag names, then the
    tags the global ``classifier`` predicts when asked to cover ``xx`` percent
    of the probability mass.
    """
    print_data(idx)

    expected = correct_classes(idx)
    print('CORRECT:', expected)

    guessed = top_xx_guesses(classifier, idx, xx)
    print('GUESSED:', guessed)


#!g1.1
press_x_to_win(3, 70)