{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Most common `ner` entries in `recipe_nlg`\n",
    "\n",
    "This notebook counts every entry of the `ner` field across the `train` split of the `recipe_nlg` dataset, keeps the most frequent names (lower-cased and de-duplicated, in rank order), and writes two alphabetically sorted buckets of them to `config.json`.\n",
    "\n",
    "The notebook is meant to be run top to bottom on a fresh kernel (Restart Kernel, then Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from collections import Counter\n",
    "\n",
    "from datasets import load_dataset\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives in this cell.\n",
    "DATA_DIR = \"~/Downloads/dataset/\"  # passed to load_dataset as data_dir\n",
    "CONFIG_PATH = \"config.json\"        # output file written by the last cell\n",
    "TOP_N = 500                        # number of most common raw entries to consider\n",
    "BUCKET_SIZE = 100                  # number of names per bucket in the output"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = load_dataset(\"recipe_nlg\", data_dir=DATA_DIR)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Count `ner` entries\n",
    "\n",
    "Counting is case-sensitive at this stage; names are lower-cased only when the top entries are selected below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ctr = Counter()\n",
    "\n",
    "for row in tqdm(dataset[\"train\"]):\n",
    "    ctr.update(row[\"ner\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select the top names\n",
    "\n",
    "The `TOP_N` most common entries are lower-cased and de-duplicated. Several raw entries can collapse to the same lower-cased name, so the resulting list may be shorter than `TOP_N`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# De-duplicate with dict.fromkeys instead of set(): a set has no defined order\n",
    "# (and string hashing differs between kernels), so slicing a set would pick\n",
    "# arbitrary names. dict.fromkeys keeps the first occurrence of each name and\n",
    "# therefore preserves the most-common-first ranking.\n",
    "first_500 = list(\n",
    "    dict.fromkeys(name.lower() for name, _count in ctr.most_common(TOP_N))\n",
    ")\n",
    "len(first_500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ranks 1..BUCKET_SIZE and the following BUCKET_SIZE names; each bucket is\n",
    "# stored in alphabetical order.\n",
    "first_100 = sorted(first_500[:BUCKET_SIZE])\n",
    "next_100 = sorted(first_500[BUCKET_SIZE:2 * BUCKET_SIZE])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write the config file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d = {\n",
    "    \"first_100\": first_100,\n",
    "    \"next_100\": next_100,\n",
    "}\n",
    "\n",
    "with open(CONFIG_PATH, \"w\") as f:\n",
    "    json.dump(d, f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}