diff --git "a/src/patent_train.ipynb" "b/src/patent_train.ipynb" --- "a/src/patent_train.ipynb" +++ "b/src/patent_train.ipynb" @@ -1,1643 +1 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Harvard USPTO Dataset Training" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Importing Packages\n", - "\n", - "We first need to import the actual USPTO dataset." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: datasets in /opt/conda/lib/python3.10/site-packages (2.11.0)\n", - "Requirement already satisfied: fsspec[http]>=2021.11.1 in /opt/conda/lib/python3.10/site-packages (from datasets) (2022.8.2)\n", - "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from datasets) (1.23.3)\n", - "Requirement already satisfied: tqdm>=4.62.1 in /opt/conda/lib/python3.10/site-packages (from datasets) (4.64.1)\n", - "Requirement already satisfied: requests>=2.19.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (2.28.1)\n", - "Requirement already satisfied: pyarrow>=8.0.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (9.0.0)\n", - "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from datasets) (6.0)\n", - "Requirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (from datasets) (1.5.0)\n", - "Requirement already satisfied: huggingface-hub<1.0.0,>=0.11.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.13.4)\n", - "Requirement already satisfied: responses<0.19 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.18.0)\n", - "Requirement already satisfied: xxhash in /opt/conda/lib/python3.10/site-packages (from datasets) (3.2.0)\n", - "Requirement already satisfied: dill<0.3.7,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from datasets) 
(0.3.6)\n", - "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.10/site-packages (from datasets) (3.8.4)\n", - "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from datasets) (21.3)\n", - "Requirement already satisfied: multiprocess in /opt/conda/lib/python3.10/site-packages (from datasets) (0.70.14)\n", - "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (22.1.0)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.1)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (4.0.2)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (6.0.4)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (1.8.2)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.3)\n", - "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (2.1.1)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0.0,>=0.11.0->datasets) (4.4.0)\n", - "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0.0,>=0.11.0->datasets) (3.12.0)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from packaging->datasets) (3.0.9)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (2022.9.24)\n", - "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from 
requests>=2.19.0->datasets) (3.4)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (1.26.11)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2022.4)\n", - "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n" - ] - } - ], - "source": [ - "!pip install datasets" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading the Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to extract the dataset. We filter only for those in January 2016." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset hupd (/home/jovyan/.cache/huggingface/datasets/HUPD___hupd/sample-a4eeba92b4229e93/0.0.0/6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e39fd26828774c8e9d159a8b5d91c4f5", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/2 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
patent_numberdecisiontitleabstractclaimsbackgroundsummarydescriptioncpc_labelipc_labelfiling_datepatent_issue_datedate_publishedexaminer_id
013261748ACCEPTEDMINI-OPTICAL NETWORK TERMINAL (ONT)The present invention relates to passive optic...1. A compact optical network terminal, compris...<SOH> BACKGROUND OF THE INVENTION <EOH>A netwo...<SOH> SUMMARY OF THE INVENTION <EOH>An aspect ...FIELD OF THE INVENTION The present invention r...H04Q110071H04Q110020160120201706062016052695191.0
113995128ACCEPTEDAPPARATUS FOR FORMING AND READING AN IDENTIFIC...Embodiments of the invention provide a method ...1. A method comprising: using a first reader t...<SOH> BACKGROUND OF THE INVENTION <EOH>Identif...<SOH> SUMMARY OF THE INVENTION <EOH>In accorda...CROSS-REFERENCE TO RELATED APPLICATIONS The pr...G06K500G06K50020160112201603222014010259514.0
214241799PENDINGPORTABLE DRUG DISPENSERA portable drug dispenser includes a chamber f...1. A portable drug dispenser, comprising: a ch...This application claims priority from U.S. app...A61J70084A61J700201601042017111695928.0
314348792ACCEPTEDLIQUID-COOLED HEAT EXCHANGERA crystal growth furnace comprising a crucible...1. A crystal growth furnace for growing a crys...<SOH> BACKGROUND OF THE INVENTION <EOH>1. Fiel...<SOH> SUMMARY OF THE INVENTION <EOH>The presen...CROSS-REFERENCE TO RELATED APPLICATIONS The pr...C30B11003C30B110020160111201805292016051263013.0
414360978REJECTEDSOLE MEMBER OF FOOTWEARA shoe midsole is composed of a base plate (1)...1. A sole member of footwear comprising a base...<SOH> BACKGROUND ART <EOH>When the heel touche...<SOH> BRIEF DESCRIPTION OF THE DRAWINGS <EOH>F...TECHNICAL FIELD The present invention relates ...A43B13181A43B1318201601132016051294490.0
.............................................
1614815002394ACCEPTEDROBOT HAND CONTROLLING METHOD AND ROBOTICS DEVICEA robot hand controlling method executes calcu...1. A controlling method of a robot hand, the r...<SOH> BACKGROUND OF THE INVENTION <EOH>1. Fiel...<SOH> SUMMARY OF THE INVENTION <EOH>An object ...BACKGROUND OF THE INVENTION 1. Field of the In...B25J91612B25J91620160120201807102016080466148.0
1614915002396REJECTEDIMMUNOGLOBULIN FUSION PROTEINS AND USES THEREOFA fusion protein is disclosed. The fusion prot...1. A fusion protein comprising an Fc fragment ...<SOH> BACKGROUND OF THE INVENTION <EOH>An immu...<SOH> SUMMARY OF THE INVENTION <EOH>The presen...The present application is a U.S. Nonprovision...C07K14745C07K14745201601202016121595819.0
1615015330955REJECTEDPIPE EXTRACTION TOOLA pipe extraction tool that grips the inside o...1. A pipe extraction tool for extracting a pip...<SOH> BACKGROUND OF THE INVENTION <EOH>1. Fiel...<SOH> BRIEF SUMMARY OF THE INVENTION <EOH>The ...CROSS-REFERENCES TO RELATED APPLICATIONS Not a...B25B2714B25B2714201601202017090795661.0
1615115330961PENDINGMolded parts with thermoplastic cellulose biop...A longitudinal extending body with oriented fi...1. A longitudinal body of a solidified organic...<SOH> BACKGROUND OF INVENTION <EOH>In the medi...<SOH> BRIEF SUMMARY OF THE PRESENT INVENTION <...CROSS REFERENCES Application claims priority o...A61L3106A61L3106201601112017101996956.0
1615215330968PENDINGTransmission method with double directivityA transmission method using a massive MIMO (Mu...1. Transmission method with double directivity...<SOH> BACKGROUND OF THE INVENTION <EOH><SOH> BRIEF SUMMARY OF THE INVENTION <EOH>The ...BACKGROUND OF THE INVENTION Field of the Inven...H04B7043H04B704201601142018032970883.0
\n", - "

16153 rows × 14 columns

\n", - "" - ], - "text/plain": [ - " patent_number decision \\\n", - "0 13261748 ACCEPTED \n", - "1 13995128 ACCEPTED \n", - "2 14241799 PENDING \n", - "3 14348792 ACCEPTED \n", - "4 14360978 REJECTED \n", - "... ... ... \n", - "16148 15002394 ACCEPTED \n", - "16149 15002396 REJECTED \n", - "16150 15330955 REJECTED \n", - "16151 15330961 PENDING \n", - "16152 15330968 PENDING \n", - "\n", - " title \\\n", - "0 MINI-OPTICAL NETWORK TERMINAL (ONT) \n", - "1 APPARATUS FOR FORMING AND READING AN IDENTIFIC... \n", - "2 PORTABLE DRUG DISPENSER \n", - "3 LIQUID-COOLED HEAT EXCHANGER \n", - "4 SOLE MEMBER OF FOOTWEAR \n", - "... ... \n", - "16148 ROBOT HAND CONTROLLING METHOD AND ROBOTICS DEVICE \n", - "16149 IMMUNOGLOBULIN FUSION PROTEINS AND USES THEREOF \n", - "16150 PIPE EXTRACTION TOOL \n", - "16151 Molded parts with thermoplastic cellulose biop... \n", - "16152 Transmission method with double directivity \n", - "\n", - " abstract \\\n", - "0 The present invention relates to passive optic... \n", - "1 Embodiments of the invention provide a method ... \n", - "2 A portable drug dispenser includes a chamber f... \n", - "3 A crystal growth furnace comprising a crucible... \n", - "4 A shoe midsole is composed of a base plate (1)... \n", - "... ... \n", - "16148 A robot hand controlling method executes calcu... \n", - "16149 A fusion protein is disclosed. The fusion prot... \n", - "16150 A pipe extraction tool that grips the inside o... \n", - "16151 A longitudinal extending body with oriented fi... \n", - "16152 A transmission method using a massive MIMO (Mu... \n", - "\n", - " claims \\\n", - "0 1. A compact optical network terminal, compris... \n", - "1 1. A method comprising: using a first reader t... \n", - "2 1. A portable drug dispenser, comprising: a ch... \n", - "3 1. A crystal growth furnace for growing a crys... \n", - "4 1. A sole member of footwear comprising a base... \n", - "... ... \n", - "16148 1. A controlling method of a robot hand, the r... 
\n", - "16149 1. A fusion protein comprising an Fc fragment ... \n", - "16150 1. A pipe extraction tool for extracting a pip... \n", - "16151 1. A longitudinal body of a solidified organic... \n", - "16152 1. Transmission method with double directivity... \n", - "\n", - " background \\\n", - "0 BACKGROUND OF THE INVENTION A netwo... \n", - "1 BACKGROUND OF THE INVENTION Identif... \n", - "2 \n", - "3 BACKGROUND OF THE INVENTION 1. Fiel... \n", - "4 BACKGROUND ART When the heel touche... \n", - "... ... \n", - "16148 BACKGROUND OF THE INVENTION 1. Fiel... \n", - "16149 BACKGROUND OF THE INVENTION An immu... \n", - "16150 BACKGROUND OF THE INVENTION 1. Fiel... \n", - "16151 BACKGROUND OF INVENTION In the medi... \n", - "16152 BACKGROUND OF THE INVENTION \n", - "\n", - " summary \\\n", - "0 SUMMARY OF THE INVENTION An aspect ... \n", - "1 SUMMARY OF THE INVENTION In accorda... \n", - "2 \n", - "3 SUMMARY OF THE INVENTION The presen... \n", - "4 BRIEF DESCRIPTION OF THE DRAWINGS F... \n", - "... ... \n", - "16148 SUMMARY OF THE INVENTION An object ... \n", - "16149 SUMMARY OF THE INVENTION The presen... \n", - "16150 BRIEF SUMMARY OF THE INVENTION The ... \n", - "16151 BRIEF SUMMARY OF THE PRESENT INVENTION <... \n", - "16152 BRIEF SUMMARY OF THE INVENTION The ... \n", - "\n", - " description cpc_label \\\n", - "0 FIELD OF THE INVENTION The present invention r... H04Q110071 \n", - "1 CROSS-REFERENCE TO RELATED APPLICATIONS The pr... G06K500 \n", - "2 This application claims priority from U.S. app... A61J70084 \n", - "3 CROSS-REFERENCE TO RELATED APPLICATIONS The pr... C30B11003 \n", - "4 TECHNICAL FIELD The present invention relates ... A43B13181 \n", - "... ... ... \n", - "16148 BACKGROUND OF THE INVENTION 1. Field of the In... B25J91612 \n", - "16149 The present application is a U.S. Nonprovision... C07K14745 \n", - "16150 CROSS-REFERENCES TO RELATED APPLICATIONS Not a... B25B2714 \n", - "16151 CROSS REFERENCES Application claims priority o... 
A61L3106 \n", - "16152 BACKGROUND OF THE INVENTION Field of the Inven... H04B7043 \n", - "\n", - " ipc_label filing_date patent_issue_date date_published examiner_id \n", - "0 H04Q1100 20160120 20170606 20160526 95191.0 \n", - "1 G06K500 20160112 20160322 20140102 59514.0 \n", - "2 A61J700 20160104 20171116 95928.0 \n", - "3 C30B1100 20160111 20180529 20160512 63013.0 \n", - "4 A43B1318 20160113 20160512 94490.0 \n", - "... ... ... ... ... ... \n", - "16148 B25J916 20160120 20180710 20160804 66148.0 \n", - "16149 C07K14745 20160120 20161215 95819.0 \n", - "16150 B25B2714 20160120 20170907 95661.0 \n", - "16151 A61L3106 20160111 20171019 96956.0 \n", - "16152 H04B704 20160114 20180329 70883.0 \n", - "\n", - "[16153 rows x 14 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_train" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pre-Processing the Data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are interested in the following columns:\n", - "- Abstract\n", - "- Claims\n", - "- Decision <- our `y`\n", - "\n", - "Let's preprocess them both out of our training and validation data\n", - "\n", - "Also, consider that the \"Decision\" column has three types of values: \"Accepted\", \"Rejected\", and \"Pending\". To remove unecessary baggage, we will be only looking for \"Accepted\" and \"Rejected\"." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "necessary_columns = [\"abstract\",\"claims\",\"decision\"]\n", - "output_values = ['ACCEPTED','REJECTED'] " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "trainFeaturesToDrop = [col for col in list(df_train.columns) if col not in necessary_columns]\n", - "trainDF = df_train.dropna()\n", - "trainDF.drop(columns=trainFeaturesToDrop, inplace=True)\n", - "trainDF = trainDF[trainDF['decision'].isin(output_values)]" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
decisionabstractclaims
0ACCEPTEDThe present invention relates to passive optic...1. A compact optical network terminal, compris...
1ACCEPTEDEmbodiments of the invention provide a method ...1. A method comprising: using a first reader t...
3ACCEPTEDA crystal growth furnace comprising a crucible...1. A crystal growth furnace for growing a crys...
4REJECTEDA shoe midsole is composed of a base plate (1)...1. A sole member of footwear comprising a base...
5ACCEPTEDA ratchet tool includes a shaft member, a hand...1. A ratchet tool, comprising a shaft member, ...
............
16144ACCEPTEDA wavelength tunable laser device, including: ...1. A wavelength tunable laser device, comprisi...
16145ACCEPTEDIn one aspect, a method for use in preparing a...1. (canceled) 2. The method of claim 19, where...
16148ACCEPTEDA robot hand controlling method executes calcu...1. A controlling method of a robot hand, the r...
16149REJECTEDA fusion protein is disclosed. The fusion prot...1. A fusion protein comprising an Fc fragment ...
16150REJECTEDA pipe extraction tool that grips the inside o...1. A pipe extraction tool for extracting a pip...
\n", - "

8719 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " decision abstract \\\n", - "0 ACCEPTED The present invention relates to passive optic... \n", - "1 ACCEPTED Embodiments of the invention provide a method ... \n", - "3 ACCEPTED A crystal growth furnace comprising a crucible... \n", - "4 REJECTED A shoe midsole is composed of a base plate (1)... \n", - "5 ACCEPTED A ratchet tool includes a shaft member, a hand... \n", - "... ... ... \n", - "16144 ACCEPTED A wavelength tunable laser device, including: ... \n", - "16145 ACCEPTED In one aspect, a method for use in preparing a... \n", - "16148 ACCEPTED A robot hand controlling method executes calcu... \n", - "16149 REJECTED A fusion protein is disclosed. The fusion prot... \n", - "16150 REJECTED A pipe extraction tool that grips the inside o... \n", - "\n", - " claims \n", - "0 1. A compact optical network terminal, compris... \n", - "1 1. A method comprising: using a first reader t... \n", - "3 1. A crystal growth furnace for growing a crys... \n", - "4 1. A sole member of footwear comprising a base... \n", - "5 1. A ratchet tool, comprising a shaft member, ... \n", - "... ... \n", - "16144 1. A wavelength tunable laser device, comprisi... \n", - "16145 1. (canceled) 2. The method of claim 19, where... \n", - "16148 1. A controlling method of a robot hand, the r... \n", - "16149 1. A fusion protein comprising an Fc fragment ... \n", - "16150 1. A pipe extraction tool for extracting a pip... 
\n", - "\n", - "[8719 rows x 3 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trainDF" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "valFeaturesToDrop = [col for col in list(df_val.columns) if col not in necessary_columns]\n", - "valDF = df_val.dropna()\n", - "valDF.drop(columns=valFeaturesToDrop, inplace=True)\n", - "valDF = valDF[valDF['decision'].isin(output_values)]" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
decisionabstractclaims
0REJECTEDRegimen for the treatment of rosacea include t...1. A treatment regimen comprising: cleansing a...
1ACCEPTEDA clamp arrangement includes a pair of bracket...1. A clamp arrangement for supporting a fractu...
2REJECTEDA system and method for device action and conf...1-20. (canceled) 21. A mobile device comprisin...
4REJECTEDSystems and methods for managing datasets prod...1. A method, comprising: executing, by one or ...
9ACCEPTEDA scan driving circuit is provided. The scan d...1. A scan driving circuit for driving a scan l...
............
9085REJECTEDThe non-rigid gate device as described may be ...1; A non-rigid blocking apparatus referred to ...
9090REJECTEDThe present invention provides an improved unc...1. A method for rendering a plastic surface am...
9091ACCEPTEDA method for detecting a software-race conditi...1. A method for detecting a software-race cond...
9092ACCEPTEDThe present application relates to multi-stage...1. A multi-stage amplitude modulation-based me...
9093ACCEPTEDA paper feeder includes a housing, a driving u...1. A paper feeder, comprising: a housing; a dr...
\n", - "

4888 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " decision abstract \\\n", - "0 REJECTED Regimen for the treatment of rosacea include t... \n", - "1 ACCEPTED A clamp arrangement includes a pair of bracket... \n", - "2 REJECTED A system and method for device action and conf... \n", - "4 REJECTED Systems and methods for managing datasets prod... \n", - "9 ACCEPTED A scan driving circuit is provided. The scan d... \n", - "... ... ... \n", - "9085 REJECTED The non-rigid gate device as described may be ... \n", - "9090 REJECTED The present invention provides an improved unc... \n", - "9091 ACCEPTED A method for detecting a software-race conditi... \n", - "9092 ACCEPTED The present application relates to multi-stage... \n", - "9093 ACCEPTED A paper feeder includes a housing, a driving u... \n", - "\n", - " claims \n", - "0 1. A treatment regimen comprising: cleansing a... \n", - "1 1. A clamp arrangement for supporting a fractu... \n", - "2 1-20. (canceled) 21. A mobile device comprisin... \n", - "4 1. A method, comprising: executing, by one or ... \n", - "9 1. A scan driving circuit for driving a scan l... \n", - "... ... \n", - "9085 1; A non-rigid blocking apparatus referred to ... \n", - "9090 1. A method for rendering a plastic surface am... \n", - "9091 1. A method for detecting a software-race cond... \n", - "9092 1. A multi-stage amplitude modulation-based me... \n", - "9093 1. A paper feeder, comprising: a housing; a dr... \n", - "\n", - "[4888 rows x 3 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "valDF" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to replace the values in the `decision` column to numerical representations. We will set \"ACCEPTED\" as `1` and \"REJECTED\" as `0`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "yKey = {\"ACCEPTED\":1,\"REJECTED\":0}" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "trainDF2 = trainDF.replace({\"decision\": yKey})\n", - "valDF2 = valDF.replace({\"decision\": yKey})" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
decisionabstractclaims
01The present invention relates to passive optic...1. A compact optical network terminal, compris...
11Embodiments of the invention provide a method ...1. A method comprising: using a first reader t...
31A crystal growth furnace comprising a crucible...1. A crystal growth furnace for growing a crys...
40A shoe midsole is composed of a base plate (1)...1. A sole member of footwear comprising a base...
51A ratchet tool includes a shaft member, a hand...1. A ratchet tool, comprising a shaft member, ...
............
161441A wavelength tunable laser device, including: ...1. A wavelength tunable laser device, comprisi...
161451In one aspect, a method for use in preparing a...1. (canceled) 2. The method of claim 19, where...
161481A robot hand controlling method executes calcu...1. A controlling method of a robot hand, the r...
161490A fusion protein is disclosed. The fusion prot...1. A fusion protein comprising an Fc fragment ...
161500A pipe extraction tool that grips the inside o...1. A pipe extraction tool for extracting a pip...
\n", - "

8719 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " decision abstract \\\n", - "0 1 The present invention relates to passive optic... \n", - "1 1 Embodiments of the invention provide a method ... \n", - "3 1 A crystal growth furnace comprising a crucible... \n", - "4 0 A shoe midsole is composed of a base plate (1)... \n", - "5 1 A ratchet tool includes a shaft member, a hand... \n", - "... ... ... \n", - "16144 1 A wavelength tunable laser device, including: ... \n", - "16145 1 In one aspect, a method for use in preparing a... \n", - "16148 1 A robot hand controlling method executes calcu... \n", - "16149 0 A fusion protein is disclosed. The fusion prot... \n", - "16150 0 A pipe extraction tool that grips the inside o... \n", - "\n", - " claims \n", - "0 1. A compact optical network terminal, compris... \n", - "1 1. A method comprising: using a first reader t... \n", - "3 1. A crystal growth furnace for growing a crys... \n", - "4 1. A sole member of footwear comprising a base... \n", - "5 1. A ratchet tool, comprising a shaft member, ... \n", - "... ... \n", - "16144 1. A wavelength tunable laser device, comprisi... \n", - "16145 1. (canceled) 2. The method of claim 19, where... \n", - "16148 1. A controlling method of a robot hand, the r... \n", - "16149 1. A fusion protein comprising an Fc fragment ... \n", - "16150 1. A pipe extraction tool for extracting a pip... \n", - "\n", - "[8719 rows x 3 columns]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trainDF2" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
decisionabstractclaims
00Regimen for the treatment of rosacea include t...1. A treatment regimen comprising: cleansing a...
11A clamp arrangement includes a pair of bracket...1. A clamp arrangement for supporting a fractu...
20A system and method for device action and conf...1-20. (canceled) 21. A mobile device comprisin...
40Systems and methods for managing datasets prod...1. A method, comprising: executing, by one or ...
91A scan driving circuit is provided. The scan d...1. A scan driving circuit for driving a scan l...
............
90850The non-rigid gate device as described may be ...1; A non-rigid blocking apparatus referred to ...
90900The present invention provides an improved unc...1. A method for rendering a plastic surface am...
90911A method for detecting a software-race conditi...1. A method for detecting a software-race cond...
90921The present application relates to multi-stage...1. A multi-stage amplitude modulation-based me...
90931A paper feeder includes a housing, a driving u...1. A paper feeder, comprising: a housing; a dr...
\n", - "

4888 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " decision abstract \\\n", - "0 0 Regimen for the treatment of rosacea include t... \n", - "1 1 A clamp arrangement includes a pair of bracket... \n", - "2 0 A system and method for device action and conf... \n", - "4 0 Systems and methods for managing datasets prod... \n", - "9 1 A scan driving circuit is provided. The scan d... \n", - "... ... ... \n", - "9085 0 The non-rigid gate device as described may be ... \n", - "9090 0 The present invention provides an improved unc... \n", - "9091 1 A method for detecting a software-race conditi... \n", - "9092 1 The present application relates to multi-stage... \n", - "9093 1 A paper feeder includes a housing, a driving u... \n", - "\n", - " claims \n", - "0 1. A treatment regimen comprising: cleansing a... \n", - "1 1. A clamp arrangement for supporting a fractu... \n", - "2 1-20. (canceled) 21. A mobile device comprisin... \n", - "4 1. A method, comprising: executing, by one or ... \n", - "9 1. A scan driving circuit for driving a scan l... \n", - "... ... \n", - "9085 1; A non-rigid blocking apparatus referred to ... \n", - "9090 1. A method for rendering a plastic surface am... \n", - "9091 1. A method for detecting a software-race cond... \n", - "9092 1. A multi-stage amplitude modulation-based me... \n", - "9093 1. A paper feeder, comprising: a housing; a dr... \n", - "\n", - "[4888 rows x 3 columns]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "valDF2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We combine the `abstract` and `claims` columns into a single `text` column. We also re-label the `decision` column to `label`." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
labeltext
01The present invention relates to passive optic...
11Embodiments of the invention provide a method ...
31A crystal growth furnace comprising a crucible...
40A shoe midsole is composed of a base plate (1)...
51A ratchet tool includes a shaft member, a hand...
.........
161441A wavelength tunable laser device, including: ...
161451In one aspect, a method for use in preparing a...
161481A robot hand controlling method executes calcu...
161490A fusion protein is disclosed. The fusion prot...
161500A pipe extraction tool that grips the inside o...
\n", - "

8719 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " label text\n", - "0 1 The present invention relates to passive optic...\n", - "1 1 Embodiments of the invention provide a method ...\n", - "3 1 A crystal growth furnace comprising a crucible...\n", - "4 0 A shoe midsole is composed of a base plate (1)...\n", - "5 1 A ratchet tool includes a shaft member, a hand...\n", - "... ... ...\n", - "16144 1 A wavelength tunable laser device, including: ...\n", - "16145 1 In one aspect, a method for use in preparing a...\n", - "16148 1 A robot hand controlling method executes calcu...\n", - "16149 0 A fusion protein is disclosed. The fusion prot...\n", - "16150 0 A pipe extraction tool that grips the inside o...\n", - "\n", - "[8719 rows x 2 columns]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trainDF3 = trainDF2.rename(columns={'decision': 'label'})\n", - "trainDF3['text'] = trainDF3['abstract'] + ' ' + trainDF3['claims']\n", - "trainDF3.drop(columns=[\"abstract\",\"claims\"],inplace=True)\n", - "trainDF3" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
labeltext
00Regimen for the treatment of rosacea include t...
11A clamp arrangement includes a pair of bracket...
20A system and method for device action and conf...
40Systems and methods for managing datasets prod...
91A scan driving circuit is provided. The scan d...
.........
90850The non-rigid gate device as described may be ...
90900The present invention provides an improved unc...
90911A method for detecting a software-race conditi...
90921The present application relates to multi-stage...
90931A paper feeder includes a housing, a driving u...
\n", - "

4888 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " label text\n", - "0 0 Regimen for the treatment of rosacea include t...\n", - "1 1 A clamp arrangement includes a pair of bracket...\n", - "2 0 A system and method for device action and conf...\n", - "4 0 Systems and methods for managing datasets prod...\n", - "9 1 A scan driving circuit is provided. The scan d...\n", - "... ... ...\n", - "9085 0 The non-rigid gate device as described may be ...\n", - "9090 0 The present invention provides an improved unc...\n", - "9091 1 A method for detecting a software-race conditi...\n", - "9092 1 The present application relates to multi-stage...\n", - "9093 1 A paper feeder includes a housing, a driving u...\n", - "\n", - "[4888 rows x 2 columns]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "valDF3 = valDF2.rename(columns={'decision': 'label'})\n", - "valDF3['text'] = valDF3['abstract'] + ' ' + valDF3['claims']\n", - "valDF3.drop(columns=[\"abstract\",\"claims\"],inplace=True)\n", - "valDF3" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can grab the data for each column so that we have a list of values for training labels, training texts, validation labels, and validation texts." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "trainLabels = trainDF3[\"label\"].tolist()\n", - "trainText = trainDF3[\"text\"].tolist()\n", - "\n", - "valLabels = valDF3[\"label\"].tolist()\n", - "valText = valDF3[\"text\"].tolist()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading the Trainer\n", - "\n", - "Now we can start training! This time, we will just go with `distilbert-base-uncased` for simplicity." 
- ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: torch in /opt/conda/lib/python3.10/site-packages (2.0.0)\n", - "Requirement already satisfied: nvidia-cusparse-cu11==11.7.4.91 in /opt/conda/lib/python3.10/site-packages (from torch) (11.7.4.91)\n", - "Requirement already satisfied: nvidia-nvtx-cu11==11.7.91 in /opt/conda/lib/python3.10/site-packages (from torch) (11.7.91)\n", - "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch) (3.1.2)\n", - "Requirement already satisfied: typing-extensions in /opt/conda/lib/python3.10/site-packages (from torch) (4.4.0)\n", - "Requirement already satisfied: nvidia-curand-cu11==10.2.10.91 in /opt/conda/lib/python3.10/site-packages (from torch) (10.2.10.91)\n", - "Requirement already satisfied: nvidia-cusolver-cu11==11.4.0.1 in /opt/conda/lib/python3.10/site-packages (from torch) (11.4.0.1)\n", - "Requirement already satisfied: nvidia-cublas-cu11==11.10.3.66 in /opt/conda/lib/python3.10/site-packages (from torch) (11.10.3.66)\n", - "Requirement already satisfied: nvidia-cufft-cu11==10.9.0.58 in /opt/conda/lib/python3.10/site-packages (from torch) (10.9.0.58)\n", - "Requirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch) (1.11.1)\n", - "Requirement already satisfied: nvidia-cuda-runtime-cu11==11.7.99 in /opt/conda/lib/python3.10/site-packages (from torch) (11.7.99)\n", - "Requirement already satisfied: triton==2.0.0 in /opt/conda/lib/python3.10/site-packages (from torch) (2.0.0)\n", - "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from torch) (3.12.0)\n", - "Requirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch) (2.8.7)\n", - "Requirement already satisfied: nvidia-nccl-cu11==2.14.3 in /opt/conda/lib/python3.10/site-packages (from torch) 
(2.14.3)\n", - "Requirement already satisfied: nvidia-cuda-cupti-cu11==11.7.101 in /opt/conda/lib/python3.10/site-packages (from torch) (11.7.101)\n", - "Requirement already satisfied: nvidia-cuda-nvrtc-cu11==11.7.99 in /opt/conda/lib/python3.10/site-packages (from torch) (11.7.99)\n", - "Requirement already satisfied: nvidia-cudnn-cu11==8.5.0.96 in /opt/conda/lib/python3.10/site-packages (from torch) (8.5.0.96)\n", - "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from nvidia-cublas-cu11==11.10.3.66->torch) (65.4.1)\n", - "Requirement already satisfied: wheel in /opt/conda/lib/python3.10/site-packages (from nvidia-cublas-cu11==11.10.3.66->torch) (0.37.1)\n", - "Requirement already satisfied: cmake in /opt/conda/lib/python3.10/site-packages (from triton==2.0.0->torch) (3.26.3)\n", - "Requirement already satisfied: lit in /opt/conda/lib/python3.10/site-packages (from triton==2.0.0->torch) (16.0.1)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch) (2.1.1)\n", - "Requirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->torch) (1.2.1)\n", - "Requirement already satisfied: transformers in /opt/conda/lib/python3.10/site-packages (4.28.1)\n", - "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /opt/conda/lib/python3.10/site-packages (from transformers) (0.13.4)\n", - "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from transformers) (6.0)\n", - "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from transformers) (1.23.3)\n", - "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from transformers) (3.12.0)\n", - "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers) (2023.3.23)\n", - "Requirement already satisfied: tqdm>=4.27 in 
/opt/conda/lib/python3.10/site-packages (from transformers) (4.64.1)\n", - "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from transformers) (2.28.1)\n", - "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /opt/conda/lib/python3.10/site-packages (from transformers) (0.13.3)\n", - "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from transformers) (21.3)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.4.0)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from packaging>=20.0->transformers) (3.0.9)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->transformers) (1.26.11)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->transformers) (2022.9.24)\n", - "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->transformers) (3.4)\n", - "Requirement already satisfied: charset-normalizer<3,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->transformers) (2.1.1)\n" - ] - } - ], - "source": [ - "!pip install torch\n", - "!pip install transformers" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "from torch.utils.data import Dataset\n", - "from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification\n", - "from transformers import Trainer, TrainingArguments" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "model_name = \"distilbert-base-uncased\"\n", - "class USPTODataset(Dataset):\n", - " def __init__(self, encodings, labels):\n", - " 
self.encodings = encodings\n", - " self.labels = labels\n", - " def __getitem__(self, idx):\n", - " item = {key: torch.tensor(val[idx]) for key, val in self.encoding.items()}\n", - " item['labels'] = torch.tensor(self.labels[idx])\n", - " return item\n", - " def __len__(self):\n", - " return len(self.labels)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_encodings = tokenizer(trainText, truncation=True, padding=True)\n", - "val_encodings = tokenizer(valText, truncation=True, padding=True)\n", - "\n", - "train_dataset = USPTODataset(train_encodings, trainLabels)\n", - "val_dataset = USPTODataset(val_encodings, valLabels)\n", - "\n", - "train_args = TrainingArguments(\n", - " output_dir=\"./results\",\n", - " num_train_epochs=2,\n", - " per_device_train_batch_size=16,\n", - " per_device_eval_batch_size=64,\n", - " warmup_steps=500,\n", - " learning_rate=5e-5,\n", - " weight_decay=0.01,\n", - " logging_dir=\"./logs\",\n", - " logging_steps=10\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - 
"file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} +{"cells":[{"cell_type":"markdown","metadata":{"id":"BGKCW074dTy2"},"source":["# Harvard USPTO Dataset Training"]},{"cell_type":"markdown","metadata":{"id":"6IttmojFdTy4"},"source":["## Preprocessing USPTO Data"]},{"cell_type":"markdown","source":["### Importing the Dataset\n","\n","We first need to import the actual USPTO dataset."],"metadata":{"id":"rJ6oNXYiOtC3"}},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"1UCFBK0OdTy5","executionInfo":{"status":"ok","timestamp":1682021338971,"user_tz":240,"elapsed":13759,"user":{"displayName":"Ryan Kim","userId":"18356277368138721144"}},"outputId":"87e553e1-6593-4b2d-e578-2a4e4e742d9b"},"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","Collecting datasets\n"," Downloading datasets-2.11.0-py3-none-any.whl (468 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m468.7/468.7 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.9/dist-packages (from datasets) (1.5.3)\n","Requirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.9/dist-packages (from datasets) (2023.4.0)\n","Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.9/dist-packages (from datasets) (2.27.1)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from datasets) (1.22.4)\n","Collecting multiprocess\n"," Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.9/132.9 kB\u001b[0m 
\u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting responses<0.19\n"," Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n","Collecting xxhash\n"," Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.2/212.2 kB\u001b[0m \u001b[31m9.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.9/dist-packages (from datasets) (23.1)\n","Collecting huggingface-hub<1.0.0,>=0.11.0\n"," Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m200.1/200.1 kB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting dill<0.3.7,>=0.3.0\n"," Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.9/dist-packages (from datasets) (4.65.0)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.9/dist-packages (from datasets) (6.0)\n","Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.9/dist-packages (from datasets) (9.0.0)\n","Collecting aiohttp\n"," Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m48.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.9/dist-packages (from aiohttp->datasets) (2.0.12)\n","Collecting async-timeout<5.0,>=4.0.0a3\n"," Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 
kB)\n","Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.9/dist-packages (from aiohttp->datasets) (23.1.0)\n","Collecting frozenlist>=1.1.1\n"," Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m158.8/158.8 kB\u001b[0m \u001b[31m20.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting aiosignal>=1.1.2\n"," Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n","Collecting yarl<2.0,>=1.0\n"," Downloading yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (264 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m264.6/264.6 kB\u001b[0m \u001b[31m21.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting multidict<7.0,>=4.5\n"," Downloading multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.2/114.2 kB\u001b[0m \u001b[31m10.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.9/dist-packages (from huggingface-hub<1.0.0,>=0.11.0->datasets) (4.5.0)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.9/dist-packages (from huggingface-hub<1.0.0,>=0.11.0->datasets) (3.11.0)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests>=2.19.0->datasets) (3.4)\n","Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests>=2.19.0->datasets) (1.26.15)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests>=2.19.0->datasets) (2022.12.7)\n","Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.9/dist-packages 
(from pandas->datasets) (2.8.2)\n","Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas->datasets) (2022.7.1)\n","Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n","Installing collected packages: xxhash, multidict, frozenlist, dill, async-timeout, yarl, responses, multiprocess, huggingface-hub, aiosignal, aiohttp, datasets\n","Successfully installed aiohttp-3.8.4 aiosignal-1.3.1 async-timeout-4.0.2 datasets-2.11.0 dill-0.3.6 frozenlist-1.3.3 huggingface-hub-0.13.4 multidict-6.0.4 multiprocess-0.70.14 responses-0.18.0 xxhash-3.2.0 yarl-1.8.2\n"]}],"source":["!pip install datasets"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"V20AfVn8dTy6"},"outputs":[],"source":["from datasets import load_dataset\n","import pandas as pd\n","import numpy as np\n","import os\n","import json\n","import torch\n","import sys"]},{"cell_type":"markdown","metadata":{"id":"DALhUYBydTy7"},"source":["### Loading the Dataset\n","\n","We need to extract the dataset. 
We filter only for those in January 2016."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":474,"referenced_widgets":["9f321834b0bc4bc1ac089f9813fc0fb1","12e47c63e2fb4596b645c252e9756899","043c0260b1e14a129feab90cff5ef099","ddba4a3380794815a732bd1a453c925c","8d6813d2f8ec401d85e599eadc8dc093","4dca0683fdc5459e88f6687f3196af7f","0dba55c366e44537adf4048c24391786","6695e22c05d548139e71029524c0bc68","2a7b1a0f1e94446bb343979d86264f0c","155a8f30bf6f4bbcb491dda1ad722c3b","aa0b1c2f45104fc5b59e1704bce27e5f","06a520a78f5d4b95982a476b29734cbf","c5be3bfd001346af8d33caf0a64efc7f","de1f71b45aba42e4a807100c4a8bf81f","6eee3b7c1ce9453e8c73f568537d5ac8","8f895536b733460d9ad2987333e3733b","e05d3a3d645948a3b2745cac3d5aa737","da25e097845440948f1d3ec1096f2a99","5cfa97b42072490f8ab4d5f60e0a1792","22f78b57f5604f928475f9a81d723baf","87b97444ca194dd18f1a5bd6e4082fb1","de9855ff33fb41b58ef21fbb26f81b85","9a74f42973b34203af1100702e07c28f","08d60a39b36942c184664157f738c5f0","fc886230dc454a72a3b2954f9818e9e8","482f58d514734fd2aa1d82693c7d5c34","f52a0089a4494e719c54289ff33a1c04","c2f38dc161184b16ba930c2362bc8e3b","e0f8f41cd81647908c645f76483148e5","2326d9dbc48b40769dcde81195324f40","0887166927dd4805ae3346c3158be0d2","04f4e59cfdfe492c9644dd99f46910e9","5cfce12745f44e5a9c94b72120170915","d7bfd7f1e624447a9c1eea68116d915b","6a337183787b43ec8634dc0c5b95bd72","7dd3444c8a754737b4ed1c8f64f68601","d9c18d34c6b249b78fcbace6d79cbdec","52879a2ff4864ba084a15fe02ab8b1fa","a90a02a84860461789ad04c386cdf44e","8cc2a2fe26534066a3a778704cc5984b","754c5121d01646c4aa4284df0eb9bb4f","b957825a21c2412a9b0101869d1f44d8","9f145dfa3f6347a7ab46aaa5fb294ea1","85d3c680e7424ab99c5153f315ac51a0","18d0fb273b444854a97ddb941859e0bc","fedd62c855344bbc931485f1bed1bb20","3cd64accf6104e5f8a3ad59381650a61","d39610354db4493f9f33405267b50179","c82c7f9596a746b6a91435ddfe2801f5","453092a5a76540f2b7561c3f4f84f8b2","ba82d11406a2498ba8948b5729fb4935","50863c3f09e343a9970b73dc70abba90"
,"269546507c224a049088863d503401b1","0cd095c293104c67996f52685a29b7e9","f4c9a13af4cd478bacedd30ee2d81b8d","f9ef1b3c4aa54426af822c5f8420f2ca","28f10c7180d24373ad411015ef51d68c","59139bbd088c4804a9c8213afc3ddf21","a6c68f2b636b4b10ab7f846a789b00aa","417d14969aa141c885ae6ddd6b554324","345e90b85bb542168925cc014a1780ff","0c2447ec9c2346feb9d9d34ad6f5ec89","2c3ac49e5ace478ea09109090519aa65","984011a4cf53494baca77e2847c1a6ec","0ad51ba66c3a48a2bf44a58c63d6f6b8","f71dc5f4ba95461eafff245393c29efb","422a18bc728147cfab41a3d434784b87","8e84ffdf79144dee93ec82a857d8abf8","a6de06c9f8c9494abd2c5146a151abf4","b707d410fc0b46be82513a3156c864ec","7cdeb5f8b21f42549d6919f0a140697a","8fe2841c26c947c28479cd459ae5edff","d40cdc8dac2b40429bd92e6330916ed5","f8c2274beb1d47e1a2e8d76d3f2babe1","a0ade24bceb54a7daedc48858588590a","895d67eeac0f46edb926a8bdd33f419b","21d8d59ca7304455aae43a23c7cbbbe8"]},"id":"d-bfQ8MsdTy8","executionInfo":{"status":"ok","timestamp":1682021405537,"user_tz":240,"elapsed":56565,"user":{"displayName":"Ryan Kim","userId":"18356277368138721144"}},"outputId":"734dcb4b-d924-479d-909c-ef907c2284c8"},"outputs":[{"output_type":"display_data","data":{"text/plain":["Downloading builder script: 0%| | 0.00/14.7k [00:00\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
patent_numberdecisionabstractclaims
013261748ACCEPTEDThe present invention relates to passive optic...1. A compact optical network terminal, compris...
113995128ACCEPTEDEmbodiments of the invention provide a method ...1. A method comprising: using a first reader t...
314348792ACCEPTEDA crystal growth furnace comprising a crucible...1. A crystal growth furnace for growing a crys...
414360978REJECTEDA shoe midsole is composed of a base plate (1)...1. A sole member of footwear comprising a base...
514369795ACCEPTEDA ratchet tool includes a shaft member, a hand...1. A ratchet tool, comprising a shaft member, ...
...............
1614415002390ACCEPTEDA wavelength tunable laser device, including: ...1. A wavelength tunable laser device, comprisi...
1614515002391ACCEPTEDIn one aspect, a method for use in preparing a...1. (canceled) 2. The method of claim 19, where...
1614815002394ACCEPTEDA robot hand controlling method executes calcu...1. A controlling method of a robot hand, the r...
1614915002396REJECTEDA fusion protein is disclosed. The fusion prot...1. A fusion protein comprising an Fc fragment ...
1615015330955REJECTEDA pipe extraction tool that grips the inside o...1. A pipe extraction tool for extracting a pip...
\n","

8719 rows × 4 columns

\n","
\n"," \n"," \n"," \n","\n"," \n","
\n"," \n"," "]},"metadata":{},"execution_count":8}],"source":["trainDF"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"gTW_PbL5dTzD"},"outputs":[],"source":["valFeaturesToDrop = [col for col in list(df_val.columns) if col not in necessary_columns]\n","valDF = df_val.dropna()\n","valDF.drop(columns=valFeaturesToDrop, inplace=True)\n","valDF = valDF[valDF['decision'].isin(output_values)]"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":659},"id":"RHVeHC5QdTzD","executionInfo":{"status":"ok","timestamp":1682021425508,"user_tz":240,"elapsed":5,"user":{"displayName":"Ryan Kim","userId":"18356277368138721144"}},"outputId":"f2341ef5-fb24-4f24-e0c7-0e6cd251233e"},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" patent_number decision \\\n","0 13144833 REJECTED \n","1 14006524 ACCEPTED \n","2 14365653 REJECTED \n","4 14396367 REJECTED \n","9 14416282 ACCEPTED \n","... ... ... \n","9085 15011551 REJECTED \n","9090 15011556 REJECTED \n","9091 15011557 ACCEPTED \n","9092 15011558 ACCEPTED \n","9093 15011559 ACCEPTED \n","\n"," abstract \\\n","0 Regimen for the treatment of rosacea include t... \n","1 A clamp arrangement includes a pair of bracket... \n","2 A system and method for device action and conf... \n","4 Systems and methods for managing datasets prod... \n","9 A scan driving circuit is provided. The scan d... \n","... ... \n","9085 The non-rigid gate device as described may be ... \n","9090 The present invention provides an improved unc... \n","9091 A method for detecting a software-race conditi... \n","9092 The present application relates to multi-stage... \n","9093 A paper feeder includes a housing, a driving u... \n","\n"," claims \n","0 1. A treatment regimen comprising: cleansing a... \n","1 1. A clamp arrangement for supporting a fractu... \n","2 1-20. (canceled) 21. A mobile device comprisin... \n","4 1. A method, comprising: executing, by one or ... \n","9 1. 
A scan driving circuit for driving a scan l... \n","... ... \n","9085 1; A non-rigid blocking apparatus referred to ... \n","9090 1. A method for rendering a plastic surface am... \n","9091 1. A method for detecting a software-race cond... \n","9092 1. A multi-stage amplitude modulation-based me... \n","9093 1. A paper feeder, comprising: a housing; a dr... \n","\n","[4888 rows x 4 columns]"],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
patent_numberdecisionabstractclaims
013144833REJECTEDRegimen for the treatment of rosacea include t...1. A treatment regimen comprising: cleansing a...
114006524ACCEPTEDA clamp arrangement includes a pair of bracket...1. A clamp arrangement for supporting a fractu...
214365653REJECTEDA system and method for device action and conf...1-20. (canceled) 21. A mobile device comprisin...
414396367REJECTEDSystems and methods for managing datasets prod...1. A method, comprising: executing, by one or ...
914416282ACCEPTEDA scan driving circuit is provided. The scan d...1. A scan driving circuit for driving a scan l...
...............
908515011551REJECTEDThe non-rigid gate device as described may be ...1; A non-rigid blocking apparatus referred to ...
909015011556REJECTEDThe present invention provides an improved unc...1. A method for rendering a plastic surface am...
909115011557ACCEPTEDA method for detecting a software-race conditi...1. A method for detecting a software-race cond...
909215011558ACCEPTEDThe present application relates to multi-stage...1. A multi-stage amplitude modulation-based me...
909315011559ACCEPTEDA paper feeder includes a housing, a driving u...1. A paper feeder, comprising: a housing; a dr...
\n","

4888 rows × 4 columns

\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":10}],"source":["valDF"]},{"cell_type":"markdown","metadata":{"id":"YFOqWvPUdTzD"},"source":["We need to replace the values in the `decision` column to numerical representations. We will set \"ACCEPTED\" as `1` and \"REJECTED\" as `0`."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"CBxfqBL0dTzD"},"outputs":[],"source":["yKey = {\"ACCEPTED\":1,\"REJECTED\":0}"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"82I6gVrtdTzE"},"outputs":[],"source":["trainDF2 = trainDF.replace({\"decision\": yKey})\n","valDF2 = valDF.replace({\"decision\": yKey})"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":746},"id":"1XVwvlGKdTzE","executionInfo":{"status":"ok","timestamp":1682021428511,"user_tz":240,"elapsed":5,"user":{"displayName":"Ryan Kim","userId":"18356277368138721144"}},"outputId":"bb49c208-ee63-4a2c-86b1-6bea0449b583"},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" patent_number decision \\\n","0 13261748 1 \n","1 13995128 1 \n","3 14348792 1 \n","4 14360978 0 \n","5 14369795 1 \n","... ... ... \n","16144 15002390 1 \n","16145 15002391 1 \n","16148 15002394 1 \n","16149 15002396 0 \n","16150 15330955 0 \n","\n"," abstract \\\n","0 The present invention relates to passive optic... \n","1 Embodiments of the invention provide a method ... \n","3 A crystal growth furnace comprising a crucible... \n","4 A shoe midsole is composed of a base plate (1)... \n","5 A ratchet tool includes a shaft member, a hand... \n","... ... \n","16144 A wavelength tunable laser device, including: ... \n","16145 In one aspect, a method for use in preparing a... \n","16148 A robot hand controlling method executes calcu... \n","16149 A fusion protein is disclosed. The fusion prot... \n","16150 A pipe extraction tool that grips the inside o... \n","\n"," claims \n","0 1. A compact optical network terminal, compris... \n","1 1. 
A method comprising: using a first reader t... \n","3 1. A crystal growth furnace for growing a crys... \n","4 1. A sole member of footwear comprising a base... \n","5 1. A ratchet tool, comprising a shaft member, ... \n","... ... \n","16144 1. A wavelength tunable laser device, comprisi... \n","16145 1. (canceled) 2. The method of claim 19, where... \n","16148 1. A controlling method of a robot hand, the r... \n","16149 1. A fusion protein comprising an Fc fragment ... \n","16150 1. A pipe extraction tool for extracting a pip... \n","\n","[8719 rows x 4 columns]"],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
patent_numberdecisionabstractclaims
0132617481The present invention relates to passive optic...1. A compact optical network terminal, compris...
1139951281Embodiments of the invention provide a method ...1. A method comprising: using a first reader t...
3143487921A crystal growth furnace comprising a crucible...1. A crystal growth furnace for growing a crys...
4143609780A shoe midsole is composed of a base plate (1)...1. A sole member of footwear comprising a base...
5143697951A ratchet tool includes a shaft member, a hand...1. A ratchet tool, comprising a shaft member, ...
...............
16144150023901A wavelength tunable laser device, including: ...1. A wavelength tunable laser device, comprisi...
16145150023911In one aspect, a method for use in preparing a...1. (canceled) 2. The method of claim 19, where...
16148150023941A robot hand controlling method executes calcu...1. A controlling method of a robot hand, the r...
16149150023960A fusion protein is disclosed. The fusion prot...1. A fusion protein comprising an Fc fragment ...
16150153309550A pipe extraction tool that grips the inside o...1. A pipe extraction tool for extracting a pip...
\n","

8719 rows × 4 columns

\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":13}],"source":["trainDF2"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":659},"id":"D0eRJb_2dTzE","executionInfo":{"status":"ok","timestamp":1682021428713,"user_tz":240,"elapsed":4,"user":{"displayName":"Ryan Kim","userId":"18356277368138721144"}},"outputId":"74150843-f3b4-459c-ef34-d9cf08f0c0d6"},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" patent_number decision \\\n","0 13144833 0 \n","1 14006524 1 \n","2 14365653 0 \n","4 14396367 0 \n","9 14416282 1 \n","... ... ... \n","9085 15011551 0 \n","9090 15011556 0 \n","9091 15011557 1 \n","9092 15011558 1 \n","9093 15011559 1 \n","\n"," abstract \\\n","0 Regimen for the treatment of rosacea include t... \n","1 A clamp arrangement includes a pair of bracket... \n","2 A system and method for device action and conf... \n","4 Systems and methods for managing datasets prod... \n","9 A scan driving circuit is provided. The scan d... \n","... ... \n","9085 The non-rigid gate device as described may be ... \n","9090 The present invention provides an improved unc... \n","9091 A method for detecting a software-race conditi... \n","9092 The present application relates to multi-stage... \n","9093 A paper feeder includes a housing, a driving u... \n","\n"," claims \n","0 1. A treatment regimen comprising: cleansing a... \n","1 1. A clamp arrangement for supporting a fractu... \n","2 1-20. (canceled) 21. A mobile device comprisin... \n","4 1. A method, comprising: executing, by one or ... \n","9 1. A scan driving circuit for driving a scan l... \n","... ... \n","9085 1; A non-rigid blocking apparatus referred to ... \n","9090 1. A method for rendering a plastic surface am... \n","9091 1. A method for detecting a software-race cond... \n","9092 1. A multi-stage amplitude modulation-based me... \n","9093 1. A paper feeder, comprising: a housing; a dr... 
\n","\n","[4888 rows x 4 columns]"],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
patent_numberdecisionabstractclaims
0131448330Regimen for the treatment of rosacea include t...1. A treatment regimen comprising: cleansing a...
1140065241A clamp arrangement includes a pair of bracket...1. A clamp arrangement for supporting a fractu...
2143656530A system and method for device action and conf...1-20. (canceled) 21. A mobile device comprisin...
4143963670Systems and methods for managing datasets prod...1. A method, comprising: executing, by one or ...
9144162821A scan driving circuit is provided. The scan d...1. A scan driving circuit for driving a scan l...
...............
9085150115510The non-rigid gate device as described may be ...1; A non-rigid blocking apparatus referred to ...
9090150115560The present invention provides an improved unc...1. A method for rendering a plastic surface am...
9091150115571A method for detecting a software-race conditi...1. A method for detecting a software-race cond...
9092150115581The present application relates to multi-stage...1. A multi-stage amplitude modulation-based me...
9093150115591A paper feeder includes a housing, a driving u...1. A paper feeder, comprising: a housing; a dr...
\n","

4888 rows × 4 columns

\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":14}],"source":["valDF2"]},{"cell_type":"markdown","metadata":{"id":"gRb0ApxTdTzF"},"source":["We re-label the `decision` column to `label`."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":676},"id":"vRFoJ8hidTzF","executionInfo":{"status":"ok","timestamp":1682021435301,"user_tz":240,"elapsed":500,"user":{"displayName":"Ryan Kim","userId":"18356277368138721144"}},"outputId":"9444d1d4-5450-4767-aeb0-c1709889b7ac"},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" patent_number label abstract \\\n","0 13261748 1 The present invention relates to passive optic... \n","1 13995128 1 Embodiments of the invention provide a method ... \n","3 14348792 1 A crystal growth furnace comprising a crucible... \n","4 14360978 0 A shoe midsole is composed of a base plate (1)... \n","5 14369795 1 A ratchet tool includes a shaft member, a hand... \n","... ... ... ... \n","16144 15002390 1 A wavelength tunable laser device, including: ... \n","16145 15002391 1 In one aspect, a method for use in preparing a... \n","16148 15002394 1 A robot hand controlling method executes calcu... \n","16149 15002396 0 A fusion protein is disclosed. The fusion prot... \n","16150 15330955 0 A pipe extraction tool that grips the inside o... \n","\n"," claims \n","0 1. A compact optical network terminal, compris... \n","1 1. A method comprising: using a first reader t... \n","3 1. A crystal growth furnace for growing a crys... \n","4 1. A sole member of footwear comprising a base... \n","5 1. A ratchet tool, comprising a shaft member, ... \n","... ... \n","16144 1. A wavelength tunable laser device, comprisi... \n","16145 1. (canceled) 2. The method of claim 19, where... \n","16148 1. A controlling method of a robot hand, the r... \n","16149 1. A fusion protein comprising an Fc fragment ... \n","16150 1. A pipe extraction tool for extracting a pip... 
\n","\n","[8719 rows x 4 columns]"],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
patent_numberlabelabstractclaims
0132617481The present invention relates to passive optic...1. A compact optical network terminal, compris...
1139951281Embodiments of the invention provide a method ...1. A method comprising: using a first reader t...
3143487921A crystal growth furnace comprising a crucible...1. A crystal growth furnace for growing a crys...
4143609780A shoe midsole is composed of a base plate (1)...1. A sole member of footwear comprising a base...
5143697951A ratchet tool includes a shaft member, a hand...1. A ratchet tool, comprising a shaft member, ...
...............
16144150023901A wavelength tunable laser device, including: ...1. A wavelength tunable laser device, comprisi...
16145150023911In one aspect, a method for use in preparing a...1. (canceled) 2. The method of claim 19, where...
16148150023941A robot hand controlling method executes calcu...1. A controlling method of a robot hand, the r...
16149150023960A fusion protein is disclosed. The fusion prot...1. A fusion protein comprising an Fc fragment ...
16150153309550A pipe extraction tool that grips the inside o...1. A pipe extraction tool for extracting a pip...
\n","

8719 rows × 4 columns

\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
# Expose the target column under the name `label`; the other columns keep
# their names.
trainDF3 = trainDF2.rename({"decision": "label"}, axis="columns")
trainDF3
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
patent_numberlabelabstractclaims
0131448330Regimen for the treatment of rosacea include t...1. A treatment regimen comprising: cleansing a...
1140065241A clamp arrangement includes a pair of bracket...1. A clamp arrangement for supporting a fractu...
2143656530A system and method for device action and conf...1-20. (canceled) 21. A mobile device comprisin...
4143963670Systems and methods for managing datasets prod...1. A method, comprising: executing, by one or ...
9144162821A scan driving circuit is provided. The scan d...1. A scan driving circuit for driving a scan l...
...............
9085150115510The non-rigid gate device as described may be ...1; A non-rigid blocking apparatus referred to ...
9090150115560The present invention provides an improved unc...1. A method for rendering a plastic surface am...
9091150115571A method for detecting a software-race conditi...1. A method for detecting a software-race cond...
9092150115581The present application relates to multi-stage...1. A multi-stage amplitude modulation-based me...
9093150115591A paper feeder includes a housing, a driving u...1. A paper feeder, comprising: a housing; a dr...
\n","

4888 rows × 4 columns

\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
# --- cell: rename the validation target column ---
valDF3 = valDF2.rename(columns={'decision': 'label'})
valDF3

# --- cell: collect each column as a plain list ---
def frame_to_columns(frame):
    """Turn a labelled patent frame into a dict of plain Python lists.

    Args:
        frame: DataFrame exposing the `patent_number`, `label`, `abstract`
            and `claims` columns.

    Returns:
        dict with the keys `patent_numbers`, `labels`, `abstracts` and
        `claims`, each holding the matching column converted with
        `.tolist()` (row order preserved).
    """
    return {
        "patent_numbers": frame["patent_number"].tolist(),
        "labels": frame["label"].tolist(),
        "abstracts": frame["abstract"].tolist(),
        "claims": frame["claims"].tolist(),
    }


# One helper for both splits keeps the two dictionaries structurally identical.
trainData = frame_to_columns(trainDF3)
valData = frame_to_columns(valDF3)

# --- cell: persist both dictionaries for later sessions ---
# exist_ok=True replaces the separate exists() check, so there is no window
# between checking for the directory and creating it.
os.makedirs("./data", exist_ok=True)

for path, payload in (("./data/train.json", trainData), ("./data/val.json", valData)):
    with open(path, "w") as outfile:
        json.dump(payload, outfile, indent=2)
This time, we will just go with `distilbert-base-uncased` for simplicity."]},{"cell_type":"markdown","source":["### Initializing Classes and Trainers"],"metadata":{"id":"YklaXlgDO6Jw"}},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"hxE_CIT_dTzH","executionInfo":{"status":"ok","timestamp":1682021471720,"user_tz":240,"elapsed":16542,"user":{"displayName":"Ryan Kim","userId":"18356277368138721144"}},"outputId":"758b0092-d56e-47b6-852a-4a19915bfe0c"},"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","Requirement already satisfied: torch in /usr/local/lib/python3.9/dist-packages (2.0.0+cu118)\n","Requirement already satisfied: jinja2 in /usr/local/lib/python3.9/dist-packages (from torch) (3.1.2)\n","Requirement already satisfied: typing-extensions in /usr/local/lib/python3.9/dist-packages (from torch) (4.5.0)\n","Requirement already satisfied: sympy in /usr/local/lib/python3.9/dist-packages (from torch) (1.11.1)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.9/dist-packages (from torch) (3.11.0)\n","Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.9/dist-packages (from torch) (2.0.0)\n","Requirement already satisfied: networkx in /usr/local/lib/python3.9/dist-packages (from torch) (3.1)\n","Requirement already satisfied: lit in /usr/local/lib/python3.9/dist-packages (from triton==2.0.0->torch) (16.0.1)\n","Requirement already satisfied: cmake in /usr/local/lib/python3.9/dist-packages (from triton==2.0.0->torch) (3.25.2)\n","Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.9/dist-packages (from jinja2->torch) (2.1.2)\n","Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.9/dist-packages (from sympy->torch) (1.3.0)\n","Looking in indexes: https://pypi.org/simple, 
https://us-python.pkg.dev/colab-wheels/public/simple/\n","Collecting transformers\n"," Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.0/7.0 MB\u001b[0m \u001b[31m81.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.9/dist-packages (from transformers) (3.11.0)\n","Collecting tokenizers!=0.11.3,<0.14,>=0.11.1\n"," Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m100.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.9/dist-packages (from transformers) (4.65.0)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (1.22.4)\n","Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from transformers) (2.27.1)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.9/dist-packages (from transformers) (6.0)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from transformers) (23.1)\n","Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (2022.10.31)\n","Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.9/dist-packages (from transformers) (0.13.4)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.9/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n","Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (2.0.12)\n","Requirement already satisfied: idna<4,>=2.5 in 
class USPTODataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Every example is returned as a dict holding one tensor per key of
    `encodings` plus a `labels` tensor.
    """

    def __init__(self, encodings, labels):
        """Keep references to the inputs; nothing is copied or converted here.

        Args:
            encodings: mapping (anything offering `.items()`) from feature
                name to a sequence indexable by example position.
            labels: sequence of labels; its length defines the dataset size.
        """
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the example at position `idx` as a dict of new tensors."""
        sample = {}
        for feature, values in self.encodings.items():
            sample[feature] = torch.tensor(values[idx])
        # Assigned last, so an encoding key that is itself named 'labels'
        # is overwritten by the real label.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
trainDataPath = "./data/train.json"
valDataPath = "./data/val.json"


def load_cached_json(current, path):
    """Return `current`, falling back to the JSON file at `path` when it is None.

    Args:
        current: value already held in memory, or None when nothing is loaded.
        path: location of the JSON cache file.

    Returns:
        `current` unchanged when it is not None; otherwise the parsed content
        of `path` if that file exists, else None.
    """
    if current is not None or not os.path.exists(path):
        return current
    # The context manager closes the handle even if parsing raises.
    with open(path) as cache_file:
        return json.load(cache_file)


# `globals().get` yields None when the name was never bound (fresh kernel).
# Referencing `trainData` directly there raises NameError before the cache is
# ever consulted. If the name is unbound and no cache file exists, it ends up
# bound to None.
trainData = load_cached_json(globals().get("trainData"), trainDataPath)
valData = load_cached_json(globals().get("valData"), valDataPath)
8f7cb7","ec7f6f10a68f4aa3b1696e4e1d59c231","041087211da7424e86b03574c00bcc7e","44c305d3e3ec44a1ac31a9e82ee00fd5","6cce9c60a7074c40ad9992597eb1f50a","87cad6102054466d8e1243da205cf506","5d1ae7f7479e485a97e80db391b6e694","789cf158a3154bba8b1091b2ec443843","f4d8392b478149949a77bf606fea3090","d457e5284b6e4ecf8efddff65b613315","abff237c84fe446f857de2c7c6fc466c","e34a8a0a27614aab95e63b221861965f","bf607dd1b0ba47c2a4b42cd934786356","f66a864297f1446d92968786100fa6ef","035fd49261424e179b16f2ae4688944e"]},"outputId":"d1afd722-6591-4860-db86-5bb9ffd58e7d"},"outputs":[{"output_type":"display_data","data":{"text/plain":["Downloading (…)okenizer_config.json: 0%| | 0.00/28.0 [00:00