{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Split the eCFR Title 12 JSONL file into train / eval sets for Prodigy\n",
    "\n",
    "Downloads the eCFR Title 12 JSONL file, copies it to Google Drive without its first line, splits the records into a training set and a 200-record evaluation set, and writes both back out as JSONL files for Prodigy."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "from google.colab import drive\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths and split settings used by the cells below.\n",
    "# Each file gets its own name so that re-running a cell always touches the same file.\n",
    "RAW_PATH = 'ecfr-title-12.jsonl'  # file name the wget download is saved under\n",
    "OUTPUT_PATH = '/content/drive/My Drive/output.jsonl'\n",
    "EVAL_PATH = 'data/eval.jsonl'\n",
    "TRAIN_PATH = 'data/train.jsonl'\n",
    "\n",
    "EVAL_SIZE = 200  # absolute number of records held out for evaluation\n",
    "SEED = 42  # random_state for the train/eval shuffle"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "cPWaiNGVZfB1"
   },
   "source": [
    "## Get the JSONL file from the GitHub repo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "kwTs86SwToom",
    "outputId": "f5562864-a61d-4ff6-f89a-553047685468"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2024-03-29 17:12:50-- https://raw.githubusercontent.com/DagimB/Text_Mining_ecfr_title12/main/ecfr-title-12.jsonl\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 17239826 (16M) [text/plain]\n",
      "Saving to: ‘ecfr-title-12.jsonl’\n",
      "\n",
      "ecfr-title-12.jsonl 100%[===================>] 16.44M 96.6MB/s in 0.2s \n",
      "\n",
      "2024-03-29 17:12:50 (96.6 MB/s) - ‘ecfr-title-12.jsonl’ saved [17239826/17239826]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Accessing the title 12 JSONL file\n",
    "#\n",
    "# -nc (no-clobber) skips the download when the file is already present, so\n",
    "# re-running this cell does not pile up ecfr-title-12.jsonl.1, .2, ... copies.\n",
    "#\n",
    "# NOTE(review): the saved output of this cell was recorded from a run against a\n",
    "# different repository URL than the one used here; re-run the cell to refresh it.\n",
    "\n",
    "!wget -nc https://raw.githubusercontent.com/ManjinderUNCC/prodigy-ecfr-textcat/main/data/ecfr-title-12.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4nZE63owa0Q5",
    "outputId": "3a20455c-6f65-4da2-af36-ce9a13e507c1"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mounted at /content/drive\n"
     ]
    }
   ],
   "source": [
    "# Mount Google Drive\n",
    "drive.mount('/content/drive')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Bipzb8pwcm5T"
   },
   "source": [
    "## Convert JSONL file into an array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "vvQtsxxxZQs9"
   },
   "outputs": [],
   "source": [
    "# Copies the downloaded JSONL file to Drive as output.jsonl, leaving out its\n",
    "# first line. The downloaded file itself is not modified.\n",
    "# NOTE(review): presumably the first line is a header/metadata record rather\n",
    "# than data -- confirm against the source file.\n",
    "\n",
    "with open(RAW_PATH, 'r', encoding='utf-8') as infile:\n",
    "    # Skip the first line; the None default avoids StopIteration on an empty file.\n",
    "    next(infile, None)\n",
    "    with open(OUTPUT_PATH, 'w', encoding='utf-8') as outfile:\n",
    "        outfile.writelines(infile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "dDbDFyUdcmIr"
   },
   "outputs": [],
   "source": [
    "def jsonl_to_array(file_path):\n",
    "    \"\"\"Read a JSONL file and return its records as a list.\n",
    "\n",
    "    Args:\n",
    "        file_path: Path of the JSONL file to read (one JSON value per line).\n",
    "\n",
    "    Returns:\n",
    "        A list with one parsed JSON value per non-blank line, in file order.\n",
    "\n",
    "    Raises:\n",
    "        json.JSONDecodeError: If a non-blank line is not valid JSON.\n",
    "    \"\"\"\n",
    "    data = []\n",
    "    # Read as UTF-8 explicitly: the file is written with encoding='utf-8', and\n",
    "    # the platform default encoding is not guaranteed to match.\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        for line in file:\n",
    "            stripped = line.strip()\n",
    "            # Skip blank lines instead of letting json.loads fail on an empty string.\n",
    "            if stripped:\n",
    "                data.append(json.loads(stripped))\n",
    "    return data\n",
    "\n",
    "data_array = jsonl_to_array(OUTPUT_PATH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "9njD4a2ee7xl"
   },
   "source": [
    "## Split array into train and evaluation using train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "x3rGk_ISfPUs",
    "outputId": "fb1ec5d4-5fd7-4ff4-e0b5-175fb16b6c31"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4665"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# see how long the array is\n",
    "len(data_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "isJmBgRGe2rc",
    "outputId": "13f453a7-4c1b-4527-c334-0e02c09caf52"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Length of eval dataset: 200\n"
     ]
    }
   ],
   "source": [
    "# Use train_test_split to get an evaluation dataset that is 200 in length.\n",
    "# An integer test_size is an absolute number of samples, so the split does not\n",
    "# depend on a hand-tuned fraction that only yields 200 for one dataset length.\n",
    "\n",
    "train_data, eval_data = train_test_split(data_array, test_size=EVAL_SIZE, random_state=SEED)\n",
    "\n",
    "# Every record must land in exactly one of the two splits.\n",
    "assert len(train_data) + len(eval_data) == len(data_array)\n",
    "\n",
    "print(f\"Length of eval dataset: {len(eval_data)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "F8Rjwp8Bh3EP",
    "outputId": "c2831246-fb42-41b5-aba7-10f99c50fee7"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4465"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "mIggR8P2LHLl"
   },
   "source": [
    "## Turn the two arrays back into JSONL files for Prodigy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "RG78jgEOLWTr"
   },
   "outputs": [],
   "source": [
    "def array_to_jsonl(data_array, file_path):\n",
    "    \"\"\"Write each item of data_array to file_path as one JSON document per line.\n",
    "\n",
    "    Args:\n",
    "        data_array: Iterable of JSON-serializable items.\n",
    "        file_path: Destination path. Missing parent directories are created;\n",
    "            an existing file is overwritten.\n",
    "\n",
    "    Raises:\n",
    "        TypeError: If an item cannot be serialized by json.dumps.\n",
    "    \"\"\"\n",
    "    target = Path(file_path)\n",
    "    # Create the parent directory first: open(..., 'w') raises FileNotFoundError\n",
    "    # when it is missing, and nothing else in this notebook creates 'data/'.\n",
    "    target.parent.mkdir(parents=True, exist_ok=True)\n",
    "    with open(target, 'w', encoding='utf-8') as file:\n",
    "        for item in data_array:\n",
    "            file.write(json.dumps(item) + '\\n')\n",
    "\n",
    "array_to_jsonl(eval_data, EVAL_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "z_sgIHfyNFLm"
   },
   "outputs": [],
   "source": [
    "array_to_jsonl(train_data, TRAIN_PATH)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}