def clean(row):
    """Normalize the ``text`` and ``target`` fields of one dataset row.

    ``text``:
      * newlines and tabs become spaces;
      * commas, single quotes and double quotes are removed;
      * the standalone word ``Reuter`` / ``REUTER`` (preceded by a space) is
        removed -- longer words such as ``Reuters`` are left intact;
      * runs of whitespace are collapsed and the ends are trimmed.

    ``target``:
      * HTML character references (``&lt;``, ``&gt;``, ``&amp;``, ...) are
        decoded to the characters they stand for.

    Args:
        row: mapping with string values under the keys ``'text'`` and
            ``'target'``.

    Returns:
        The same ``row`` object, updated in place.
    """
    # Local imports keep this definition self-contained in its cell.
    import html
    import re

    # One pass: whitespace controls -> space, punctuation -> dropped.
    char_table = str.maketrans({
        '\n': ' ',
        '\t': ' ',
        ',': None,
        '\'': None,
        '"': None,
    })
    text = row['text'].translate(char_table)

    # The trailing \b is the fix: a plain substring removal of ' Reuter'
    # also ate the start of ' Reuters', leaving a stray 's' glued to the
    # previous word.
    text = re.sub(r' (?:Reuter|REUTER)\b', '', text)

    # split()/join collapses every whitespace run and trims both ends.
    row['text'] = ' '.join(text.split())

    # Decode entities instead of replacing '<' with '<' (which did nothing).
    row['target'] = html.unescape(row['target'])
    return row
# Run `clean` over the dataset and keep the result under the same name.
dataset = dataset.map(clean)

# Display one cleaned training example as a spot check.
dataset['train'][1]

# Save the processed dataset to a local directory.
dataset.save_to_disk('reuters_processed')

# Write each split to its own CSV file (header row, no index column).
for split_name, csv_path in (
    ('train', 'reuters_train.csv'),
    ('test', 'reuters_test.csv'),
):
    dataset[split_name].to_csv(csv_path, index=False, header=True)