{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "cPWaiNGVZfB1"
      },
      "source": [
        "## Get the JSONL file from the GitHub repo"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "kwTs86SwToom",
        "outputId": "f5562864-a61d-4ff6-f89a-553047685468"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "--2024-03-29 17:12:50--  https://raw.githubusercontent.com/DagimB/Text_Mining_ecfr_title12/main/ecfr-title-12.jsonl\n",
            "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...\n",
            "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 17239826 (16M) [text/plain]\n",
            "Saving to: ‘ecfr-title-12.jsonl’\n",
            "\n",
            "ecfr-title-12.jsonl 100%[===================>]  16.44M  96.6MB/s    in 0.2s    \n",
            "\n",
            "2024-03-29 17:12:50 (96.6 MB/s) - ‘ecfr-title-12.jsonl’ saved [17239826/17239826]\n",
            "\n"
          ]
        }
      ],
      "source": [
        "# Download the Title 12 JSONL file. -O pins the output name so re-running this cell\n",
        "# overwrites the file instead of saving a second copy as ecfr-title-12.jsonl.1\n",
        "\n",
        "!wget -O ecfr-title-12.jsonl https://raw.githubusercontent.com/ManjinderUNCC/prodigy-ecfr-textcat/main/data/ecfr-title-12.jsonl"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4nZE63owa0Q5",
        "outputId": "3a20455c-6f65-4da2-af36-ce9a13e507c1"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ],
      "source": [
        "from google.colab import drive\n",
        "\n",
        "# Mount Google Drive at /content/drive so files under 'My Drive' can be read and written\n",
        "drive.mount('/content/drive')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Bipzb8pwcm5T"
      },
      "source": [
        "## Convert JSONL file into an array"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "vvQtsxxxZQs9"
      },
      "outputs": [],
      "source": [
        "# Drop the first line of the downloaded JSONL file and save the rest to Drive as output.jsonl\n",
        "\n",
        "# json is not used in this cell; the JSONL helper functions in later cells rely on it\n",
        "import json\n",
        "\n",
        "# Defined once and reused below; the loading cell also reads this variable\n",
        "file_path = '/content/drive/My Drive/output.jsonl'\n",
        "\n",
        "with open('ecfr-title-12.jsonl', 'r', encoding='utf-8') as infile:\n",
        "    # Skip the first line; the None default avoids StopIteration if the download is empty\n",
        "    next(infile, None)\n",
        "    with open(file_path, 'w', encoding='utf-8') as outfile:\n",
        "        outfile.writelines(infile)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dDbDFyUdcmIr"
      },
      "outputs": [],
      "source": [
        "def jsonl_to_array(file_path):\n",
        "    \"\"\"Read a JSONL file and return its records as a list.\n",
        "\n",
        "    Args:\n",
        "        file_path: Path to a UTF-8 text file with one JSON document per line.\n",
        "\n",
        "    Returns:\n",
        "        A list with one parsed JSON value per non-blank line, in file order.\n",
        "\n",
        "    Raises:\n",
        "        json.JSONDecodeError: If a non-blank line is not valid JSON.\n",
        "    \"\"\"\n",
        "    data = []\n",
        "    # Read as UTF-8 explicitly rather than relying on the platform default encoding\n",
        "    with open(file_path, 'r', encoding='utf-8') as file:\n",
        "        for line in file:\n",
        "            stripped = line.strip()\n",
        "            # Blank lines carry no record; skip them instead of letting json.loads('') raise\n",
        "            if not stripped:\n",
        "                continue\n",
        "            data.append(json.loads(stripped))\n",
        "    return data\n",
        "\n",
        "# Parse the JSONL file at file_path into an in-memory list of records\n",
        "data_array = jsonl_to_array(file_path)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9njD4a2ee7xl"
      },
      "source": [
        "## Split the array into training and evaluation sets using `train_test_split`"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "x3rGk_ISfPUs",
        "outputId": "fb1ec5d4-5fd7-4ff4-e0b5-175fb16b6c31"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "4665"
            ]
          },
          "execution_count": 5,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Number of records parsed from the JSONL file\n",
        "len(data_array)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "isJmBgRGe2rc",
        "outputId": "13f453a7-4c1b-4527-c334-0e02c09caf52"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Length of eval dataset: 200\n"
          ]
        }
      ],
      "source": [
        "# Hold out exactly EVAL_SIZE records for evaluation. An integer test_size is an absolute\n",
        "# sample count, so the eval set stays at 200 even if the number of input records changes.\n",
        "\n",
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "EVAL_SIZE = 200\n",
        "train_data, eval_data = train_test_split(data_array, test_size=EVAL_SIZE, random_state=42)\n",
        "\n",
        "# Every record must land in exactly one of the two splits\n",
        "assert len(train_data) + len(eval_data) == len(data_array)\n",
        "\n",
        "print(f\"Length of eval dataset: {len(eval_data)}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "F8Rjwp8Bh3EP",
        "outputId": "c2831246-fb42-41b5-aba7-10f99c50fee7"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "4465"
            ]
          },
          "execution_count": 7,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Number of records remaining in the training split\n",
        "len(train_data)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "mIggR8P2LHLl"
      },
      "source": [
        "## Turn the two arrays back into JSONL files for Prodigy"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "RG78jgEOLWTr"
      },
      "outputs": [],
      "source": [
        "def array_to_jsonl(data_array, file_path):\n",
        "    \"\"\"Write each item of data_array to file_path as one JSON document per line.\n",
        "\n",
        "    Missing parent directories of file_path are created first, so writing to a path\n",
        "    such as 'data/eval.jsonl' works even when 'data/' does not exist yet.\n",
        "\n",
        "    Args:\n",
        "        data_array: Iterable of JSON-serializable items.\n",
        "        file_path: Destination path; an existing file is overwritten.\n",
        "    \"\"\"\n",
        "    from pathlib import Path\n",
        "\n",
        "    Path(file_path).parent.mkdir(parents=True, exist_ok=True)\n",
        "    with open(file_path, 'w', encoding='utf-8') as file:\n",
        "        for item in data_array:\n",
        "            file.write(json.dumps(item) + '\\n')\n",
        "\n",
        "# Write the evaluation split to data/eval.jsonl.\n",
        "# NOTE: this rebinds file_path, which earlier cells used for the Drive output path.\n",
        "file_path = 'data/eval.jsonl'\n",
        "array_to_jsonl(eval_data, file_path)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "z_sgIHfyNFLm"
      },
      "outputs": [],
      "source": [
        "# Write the training split to data/train.jsonl\n",
        "file_path = 'data/train.jsonl'\n",
        "array_to_jsonl(train_data, file_path)"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}