#!g1.1
from sklearn.preprocessing import LabelEncoder
import transformers
import torch
import nltk
import numpy as np
import pandas as pd

#!g1.1
# Read the arXiv metadata snapshot into a DataFrame. lines=True makes pandas
# parse the file as one JSON object per line.
df = pd.read_json(
    path_or_buf='arxiv-metadata-oai-snapshot.json',
    lines=True,
)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsubmitterauthorstitlecommentsjournal-refdoireport-nocategorieslicenseabstractversionsupdate_dateauthors_parsed
00704.0001Pavel NadolskyC. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-...Calculation of prompt diphoton production cros...37 pages, 15 figures; published versionPhys.Rev.D76:013009,200710.1103/PhysRevD.76.013009ANL-HEP-PR-07-12hep-phNoneA fully differential calculation in perturba...[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...2008-11-26[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...
10704.0002Louis TheranIleana Streinu and Louis TheranSparsity-certifying Graph DecompositionsTo appear in Graphs and CombinatoricsNoneNoneNonemath.CO cs.CGhttp://arxiv.org/licenses/nonexclusive-distrib...We describe a new algorithm, the $(k,\\ell)$-...[{'version': 'v1', 'created': 'Sat, 31 Mar 200...2008-12-13[[Streinu, Ileana, ], [Theran, Louis, ]]
20704.0003Hongjun PanHongjun PanThe evolution of the Earth-Moon system based o...23 pages, 3 figuresNoneNoneNonephysics.gen-phNoneThe evolution of Earth-Moon system is descri...[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...2008-01-13[[Pan, Hongjun, ]]
30704.0004David CallanDavid CallanA determinant of Stirling cycle numbers counts...11 pagesNoneNoneNonemath.CONoneWe show that a determinant of Stirling cycle...[{'version': 'v1', 'created': 'Sat, 31 Mar 200...2007-05-23[[Callan, David, ]]
40704.0005Alberto TorchinskyWael Abu-Shammala and Alberto TorchinskyFrom dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...NoneIllinois J. Math. 52 (2008) no.2, 681-689NoneNonemath.CA math.FANoneIn this paper we show how to compute the $\\L...[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...2013-10-15[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]
\n", "
" ], "text/plain": [ " id submitter \\\n", "0 0704.0001 Pavel Nadolsky \n", "1 0704.0002 Louis Theran \n", "2 0704.0003 Hongjun Pan \n", "3 0704.0004 David Callan \n", "4 0704.0005 Alberto Torchinsky \n", "\n", " authors \\\n", "0 C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... \n", "1 Ileana Streinu and Louis Theran \n", "2 Hongjun Pan \n", "3 David Callan \n", "4 Wael Abu-Shammala and Alberto Torchinsky \n", "\n", " title \\\n", "0 Calculation of prompt diphoton production cros... \n", "1 Sparsity-certifying Graph Decompositions \n", "2 The evolution of the Earth-Moon system based o... \n", "3 A determinant of Stirling cycle numbers counts... \n", "4 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n", "\n", " comments \\\n", "0 37 pages, 15 figures; published version \n", "1 To appear in Graphs and Combinatorics \n", "2 23 pages, 3 figures \n", "3 11 pages \n", "4 None \n", "\n", " journal-ref doi \\\n", "0 Phys.Rev.D76:013009,2007 10.1103/PhysRevD.76.013009 \n", "1 None None \n", "2 None None \n", "3 None None \n", "4 Illinois J. Math. 52 (2008) no.2, 681-689 None \n", "\n", " report-no categories \\\n", "0 ANL-HEP-PR-07-12 hep-ph \n", "1 None math.CO cs.CG \n", "2 None physics.gen-ph \n", "3 None math.CO \n", "4 None math.CA math.FA \n", "\n", " license \\\n", "0 None \n", "1 http://arxiv.org/licenses/nonexclusive-distrib... \n", "2 None \n", "3 None \n", "4 None \n", "\n", " abstract \\\n", "0 A fully differential calculation in perturba... \n", "1 We describe a new algorithm, the $(k,\\ell)$-... \n", "2 The evolution of Earth-Moon system is descri... \n", "3 We show that a determinant of Stirling cycle... \n", "4 In this paper we show how to compute the $\\L... \n", "\n", " versions update_date \\\n", "0 [{'version': 'v1', 'created': 'Mon, 2 Apr 2007... 2008-11-26 \n", "1 [{'version': 'v1', 'created': 'Sat, 31 Mar 200... 2008-12-13 \n", "2 [{'version': 'v1', 'created': 'Sun, 1 Apr 2007... 
#!g1.1
# Preview the first rows to check which columns were loaded.
df.head(5)

#!g1.1
# First-listed category of the record at position 10.
# The value is read straight from the cell instead of tokenising the printed
# repr of a one-row Series: the repr layout (index label, padding, truncation,
# dtype footer) is a display detail and not a stable thing to parse.
df['categories'].iloc[10].split()[0]
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsubmitterauthorstitlecommentsjournal-refdoireport-nocategorieslicenseabstractversionsupdate_dateauthors_parsedtag
00704.0001Pavel NadolskyC. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-...Calculation of prompt diphoton production cros...37 pages, 15 figures; published versionPhys.Rev.D76:013009,200710.1103/PhysRevD.76.013009ANL-HEP-PR-07-12hep-phNoneA fully differential calculation in perturba...[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...2008-11-26[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...hep-ph
10704.0002Louis TheranIleana Streinu and Louis TheranSparsity-certifying Graph DecompositionsTo appear in Graphs and CombinatoricsNoneNoneNonemath.CO cs.CGhttp://arxiv.org/licenses/nonexclusive-distrib...We describe a new algorithm, the $(k,\\ell)$-...[{'version': 'v1', 'created': 'Sat, 31 Mar 200...2008-12-13[[Streinu, Ileana, ], [Theran, Louis, ]]math
20704.0003Hongjun PanHongjun PanThe evolution of the Earth-Moon system based o...23 pages, 3 figuresNoneNoneNonephysics.gen-phNoneThe evolution of Earth-Moon system is descri...[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...2008-01-13[[Pan, Hongjun, ]]physics
30704.0004David CallanDavid CallanA determinant of Stirling cycle numbers counts...11 pagesNoneNoneNonemath.CONoneWe show that a determinant of Stirling cycle...[{'version': 'v1', 'created': 'Sat, 31 Mar 200...2007-05-23[[Callan, David, ]]math
40704.0005Alberto TorchinskyWael Abu-Shammala and Alberto TorchinskyFrom dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...NoneIllinois J. Math. 52 (2008) no.2, 681-689NoneNonemath.CA math.FANoneIn this paper we show how to compute the $\\L...[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...2013-10-15[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]math
\n", "
" ], "text/plain": [ " id submitter \\\n", "0 0704.0001 Pavel Nadolsky \n", "1 0704.0002 Louis Theran \n", "2 0704.0003 Hongjun Pan \n", "3 0704.0004 David Callan \n", "4 0704.0005 Alberto Torchinsky \n", "\n", " authors \\\n", "0 C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-... \n", "1 Ileana Streinu and Louis Theran \n", "2 Hongjun Pan \n", "3 David Callan \n", "4 Wael Abu-Shammala and Alberto Torchinsky \n", "\n", " title \\\n", "0 Calculation of prompt diphoton production cros... \n", "1 Sparsity-certifying Graph Decompositions \n", "2 The evolution of the Earth-Moon system based o... \n", "3 A determinant of Stirling cycle numbers counts... \n", "4 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n", "\n", " comments \\\n", "0 37 pages, 15 figures; published version \n", "1 To appear in Graphs and Combinatorics \n", "2 23 pages, 3 figures \n", "3 11 pages \n", "4 None \n", "\n", " journal-ref doi \\\n", "0 Phys.Rev.D76:013009,2007 10.1103/PhysRevD.76.013009 \n", "1 None None \n", "2 None None \n", "3 None None \n", "4 Illinois J. Math. 52 (2008) no.2, 681-689 None \n", "\n", " report-no categories \\\n", "0 ANL-HEP-PR-07-12 hep-ph \n", "1 None math.CO cs.CG \n", "2 None physics.gen-ph \n", "3 None math.CO \n", "4 None math.CA math.FA \n", "\n", " license \\\n", "0 None \n", "1 http://arxiv.org/licenses/nonexclusive-distrib... \n", "2 None \n", "3 None \n", "4 None \n", "\n", " abstract \\\n", "0 A fully differential calculation in perturba... \n", "1 We describe a new algorithm, the $(k,\\ell)$-... \n", "2 The evolution of Earth-Moon system is descri... \n", "3 We show that a determinant of Stirling cycle... \n", "4 In this paper we show how to compute the $\\L... \n", "\n", " versions update_date \\\n", "0 [{'version': 'v1', 'created': 'Mon, 2 Apr 2007... 2008-11-26 \n", "1 [{'version': 'v1', 'created': 'Sat, 31 Mar 200... 2008-12-13 \n", "2 [{'version': 'v1', 'created': 'Sun, 1 Apr 2007... 
#!g1.1
def get_tag(row):
    """Return the top-level archive of the first category listed in ``row``.

    ``row`` is one value of the ``categories`` column: a whitespace-separated
    string of category codes such as ``'math.CO cs.CG'``. The first code is
    taken and everything from its first ``'.'`` onward is dropped, so
    ``'math.CO cs.CG'`` gives ``'math'`` and ``'hep-ph'`` gives ``'hep-ph'``.

    Returns ``None`` when ``row`` is not a string or holds no category code,
    instead of silently swallowing an exception or producing the bogus tag
    ``'None'`` from ``str(None)``.
    """
    if not isinstance(row, str):
        return None
    codes = row.split()
    if not codes:
        return None
    return codes[0].split('.')[0]


df['tag'] = df['categories'].apply(get_tag)
df.head(5)

#!g1.1
# One tag per record, in file order. Reading the column directly avoids
# materialising every row with iterrows() just to pick a single field.
labels = df['tag'].tolist()

#!g1.1
# Number of records per tag; keys appear in order of first occurrence.
map_labels = {}
for label in labels:
    map_labels[label] = map_labels.get(label, 0) + 1
#!g1.1
# Per-tag record counts for the full dataset.
map_labels

#!g1.1
# Tags kept for the classification task, mapped to class ids numbered 1..8.
dict_label = {
    'math': 4, 'physics': 5, 'q-bio': 6, 'cs': 1,
    'q-fin': 7, 'stat': 8, 'eess': 3, 'econ': 2,
}

#!g1.1
# Restrict the counts to the tags that have a class id in dict_label.
new_map_labels = {
    tag: count for tag, count in map_labels.items() if tag in dict_label
}

#!g1.1
new_map_labels
#!g1.1
# Cap every selected tag at `max_per_tag` records so the largest archives do
# not dominate the sample. A fresh dict is built (min() is idempotent), so the
# cell gives the same result when it is run again.
max_per_tag = 8000
new_map_labels = {
    tag: min(count, max_per_tag) for tag, count in new_map_labels.items()
}
sum_value = sum(new_map_labels.values())
sum_value

#!g1.1
# Keep the first `max_per_tag` records of each selected tag, in file order.
# GroupBy.head returns a subset of the original rows with their order
# preserved, which matches the previous row-by-row loop without growing a
# DataFrame one append at a time (quadratic, and DataFrame.append no longer
# exists in pandas 2.x) and without consuming the counters in new_map_labels.
is_selected = df['tag'].isin(list(new_map_labels))
small_df = (
    df[is_selected]
    .groupby('tag', sort=False)
    .head(max_per_tag)
    .reset_index(drop=True)
)

#!g1.1
small_df.shape

#!g1.1
# Model input: "<title>. <abstract>" with newlines flattened to spaces,
# lower-cased.
small_df["text"] = (
    (small_df["title"] + ". " + small_df["abstract"])
    .str.replace("\n", " ", regex=False)
    .str.lower()
)

#!g1.1
# Zero-based class id of every record (dict_label ids shifted down by one).
# Assigning to the `row` yielded by iterrows() is not reliable: the row may be
# a copy, in which case the write never reaches the frame. Mapping the column
# is deterministic, and astype(int) fails loudly if a tag has no class id.
small_df['label'] = small_df['tag'].map(dict_label).astype(int) - 1

#!g1.1
data = small_df[['text', 'label']]

#!g1.1
# Sanity check: dict_label ids run from 1 to 8, so after the shift the value 8
# must not occur. Expected result: 0.
(data['label'] == 8).sum()

#!g1.1
# Records per tag in the sample; keys appear in order of first occurrence.
dict_label_map = {}
for tag in small_df['tag']:
    dict_label_map[tag] = dict_label_map.get(tag, 0) + 1

#!g1.1
dict_label_map

#!g1.1
from sklearn.model_selection import train_test_split
# Hold out 20% of the records as the test set, then take the validation set
# from the remaining training portion only, so that no test record can end up
# in train or validation. 0.25 of the remaining 80% is 20% of the data, which
# gives a 60/20/20 split. random_state pins the shuffle for reproducibility.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)

#!g1.1
# Write the three splits without the index column.
train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

#!g1.1
# DataSphere does not give me a proper way to work with the tokenizer here,
# so the work continues in another notebook.