{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "1383f909", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: transformers in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (4.11.3)\n", "Collecting datasets\n", " Downloading datasets-1.15.1-py3-none-any.whl (290 kB)\n", "\u001b[K |████████████████████████████████| 290 kB 2.4 MB/s eta 0:00:01\n", "\u001b[?25hCollecting fsspec[http]>=2021.05.0\n", " Downloading fsspec-2021.11.0-py3-none-any.whl (132 kB)\n", "\u001b[K |████████████████████████████████| 132 kB 3.8 MB/s eta 0:00:01\n", "\u001b[?25hCollecting multiprocess\n", " Downloading multiprocess-0.70.12.2-py38-none-any.whl (128 kB)\n", "\u001b[K |████████████████████████████████| 128 kB 5.5 MB/s eta 0:00:01\n", "\u001b[?25hCollecting dill\n", " Downloading dill-0.3.4-py2.py3-none-any.whl (86 kB)\n", "\u001b[K |████████████████████████████████| 86 kB 4.4 MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied: pandas in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from datasets) (1.2.4)\n", "Collecting huggingface-hub<1.0.0,>=0.1.0\n", " Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)\n", "\u001b[K |████████████████████████████████| 59 kB 4.4 MB/s eta 0:00:01\n", "\u001b[?25hCollecting xxhash\n", " Downloading xxhash-2.0.2-cp38-cp38-macosx_10_9_x86_64.whl (31 kB)\n", "Requirement already satisfied: requests>=2.19.0 in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from datasets) (2.25.1)\n", "Collecting tqdm>=4.62.1\n", " Using cached tqdm-4.62.3-py2.py3-none-any.whl (76 kB)\n", "Collecting pyarrow!=4.0.0,>=1.0.0\n", " Downloading pyarrow-6.0.1-cp38-cp38-macosx_10_13_x86_64.whl (19.1 MB)\n", "\u001b[K |████████████████████████████████| 19.1 MB 5.9 MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from datasets) (1.20.1)\n", "Collecting aiohttp\n", " Downloading aiohttp-3.8.1-cp38-cp38-macosx_10_9_x86_64.whl (574 kB)\n", "\u001b[K |████████████████████████████████| 574 kB 5.0 MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied: packaging in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from datasets) (20.9)\n", "Requirement already satisfied: pyyaml in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (5.4.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.7.4.3)\n", "Requirement already satisfied: filelock in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.0.12)\n", "Requirement already satisfied: pyparsing>=2.0.2 in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from packaging->datasets) (2.4.7)\n", "Requirement already satisfied: idna<3,>=2.5 in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (2.10)\n", "Requirement already satisfied: certifi>=2017.4.17 in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (2020.12.5)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (1.26.4)\n", "Requirement already satisfied: chardet<5,>=3.0.2 in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (4.0.0)\n", "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from transformers) (0.10.3)\n", "Requirement already satisfied: sacremoses in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from transformers) (0.0.46)\n", "Requirement already satisfied: regex!=2019.12.17 in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from transformers) (2021.4.4)\n", "Collecting charset-normalizer<3.0,>=2.0\n", " Using cached charset_normalizer-2.0.7-py3-none-any.whl (38 kB)\n", "Collecting frozenlist>=1.1.1\n", " Downloading frozenlist-1.2.0-cp38-cp38-macosx_10_9_x86_64.whl (81 kB)\n", "\u001b[K |████████████████████████████████| 81 kB 7.0 MB/s eta 0:00:011\n", "\u001b[?25hCollecting multidict<7.0,>=4.5\n", " Downloading multidict-5.2.0-cp38-cp38-macosx_10_9_x86_64.whl (45 kB)\n", "\u001b[K |████████████████████████████████| 45 kB 4.0 MB/s eta 0:00:011\n", "\u001b[?25hCollecting yarl<2.0,>=1.0\n", " Downloading yarl-1.7.2-cp38-cp38-macosx_10_9_x86_64.whl (121 kB)\n", "\u001b[K |████████████████████████████████| 121 kB 5.8 MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied: attrs>=17.3.0 in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from aiohttp->datasets) (20.3.0)\n", "Collecting async-timeout<5.0,>=4.0.0a3\n", " Downloading async_timeout-4.0.1-py3-none-any.whl (5.7 kB)\n", "Collecting aiosignal>=1.1.2\n", " Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n", "Requirement already satisfied: python-dateutil>=2.7.3 in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from pandas->datasets) (2.8.1)\n", "Requirement already satisfied: pytz>=2017.3 in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from pandas->datasets) (2021.1)\n", "Requirement already satisfied: six>=1.5 in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n", "Requirement already satisfied: joblib in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from sacremoses->transformers) (1.0.1)\n", "Requirement already satisfied: click in /Users/markmcquade/opt/anaconda3/lib/python3.8/site-packages (from sacremoses->transformers) (7.1.2)\n", "Installing collected packages: multidict, frozenlist, yarl, charset-normalizer, async-timeout, aiosignal, tqdm, fsspec, dill, aiohttp, xxhash, pyarrow, multiprocess, huggingface-hub, datasets\n", " Attempting uninstall: tqdm\n", " Found existing installation: tqdm 4.59.0\n", " Uninstalling tqdm-4.59.0:\n", " Successfully uninstalled tqdm-4.59.0\n", " Attempting uninstall: fsspec\n", " Found existing installation: fsspec 0.9.0\n", " Uninstalling fsspec-0.9.0:\n", " Successfully uninstalled fsspec-0.9.0\n", " Attempting uninstall: huggingface-hub\n", " Found existing installation: huggingface-hub 0.0.19\n", " Uninstalling huggingface-hub-0.0.19:\n", " Successfully uninstalled huggingface-hub-0.0.19\n", "Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.1 charset-normalizer-2.0.7 datasets-1.15.1 dill-0.3.4 frozenlist-1.2.0 fsspec-2021.11.0 huggingface-hub-0.1.2 multidict-5.2.0 multiprocess-0.70.12.2 pyarrow-6.0.1 tqdm-4.62.3 xxhash-2.0.2 yarl-1.7.2\n" ] } ], "source": [ "!pip install transformers datasets" ] }, { "cell_type": "code", "execution_count": 3, "id": "d0ec8542", "metadata": {}, "outputs": [], "source": [ "import datasets" ] }, { "cell_type": "code", "execution_count": 6, "id": "248be0df", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Reusing dataset reuters21578 (/Users/markmcquade/.cache/huggingface/datasets/reuters21578/ModHayes/1.0.0/bd91fac5a25fc818873c02a7281cc276c9b326a9e6a89288fc6ba6967772240f)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6cbd2502b9d54505b2a60aa8f809a2f5", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/2 [00:00, which also owns a 55 pct interest in Standard Oil.\\n The venture will be called BP/Standard Financial Trading\\nand will be operated by Standard Oil under the oversight of a\\njoint management committee.\\n\\n Reuter\\n',\n", " 'target': 'STANDARD OIL <SRD> TO FORM FINANCIAL UNIT'}" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset['train'][1]" ] }, { "cell_type": "markdown", "id": "4151eddf", "metadata": {}, "source": [ "Drop new lines and tabs, replace with white space. Remove commas, remove quotes. Replace \"Reuter\" from end of text. Drop extra white space agains. Remove html tags" ] }, { "cell_type": "code", "execution_count": 22, "id": "76bc67d3", "metadata": {}, "outputs": [], "source": [ "def clean(row):\n", " row['text'] = row['text'].replace('\\n',' ').replace('\\t',' ')\\\n", " .replace(',','').replace('\\'','').replace('\\\"','')\\\n", " .replace(' Reuter','').replace(' REUTER','')\n", " row['text'] = \" \".join(row['text'].split())\n", " row['target'] = row['target'].replace('<','<').replace('>','>')\n", " return row" ] }, { "cell_type": "code", "execution_count": 23, "id": "40bff5a5", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7a4b761c23b84a138fa74ef9dfe9ed49", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/722 [00:00 which also owns a 55 pct interest in Standard Oil. The venture will be called BP/Standard Financial Trading and will be operated by Standard Oil under the oversight of a joint management committee.',\n", " 'target': 'STANDARD OIL TO FORM FINANCIAL UNIT'}" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset['train'][1]" ] }, { "cell_type": "code", "execution_count": 25, "id": "22c4da2e", "metadata": {}, "outputs": [], "source": [ "dataset.save_to_disk('reuters_processed')" ] }, { "cell_type": "code", "execution_count": 27, "id": "95ef34aa", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "17b994c03fe84d0082f285fd972c4140", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Creating CSV from Arrow format: 0%| | 0/3 [00:00