{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e66bbb77-71f5-4d80-b766-f67144ea7a93",
   "metadata": {},
   "source": [
    "# Data Records\n",
    "\n",
    "## This notebook generates the data_records.json file where each entry in the resulting dictionary follows the form {filename: num_records} for every dataset we will use during training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "74ad6613-44ff-435e-8550-df993e915677",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# import relevant libraries\n",
    "import os\n",
    "import boto3\n",
    "import json\n",
    "from smart_open import open"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2d53761-da0e-44f4-8a3e-1285bf810b03",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "s3 = boto3.resource('s3')\n",
    "my_bucket = s3.Bucket('lodestone-rnd')\n",
    "\n",
    "# collect all filenames from the data/ directory of the lodestone-rnd S3 bucket\n",
    "files = [\"\"]*((621+12+9+36)+1)\n",
    "for i, object_summary in enumerate(my_bucket.objects.filter(Prefix=\"data/\")):\n",
    "    files[i] = object_summary.key[5:]\n",
    "files = files[1:]\n",
    "files = [file for file in files if file != 'cnn_dailymail_splitted.json.gz']\n",
    "\n",
    "s3_client = boto3.client(\"s3\")\n",
    "\n",
    "# for each training dataset, store the number of records in a dictionary with the following form {filename: num_records}\n",
    "data_lengths = {}\n",
    "for file in files:\n",
    "    source_uri = f's3://lodestone-rnd/data/{file}'\n",
    "    # S2ORC_citations_abstracts.json.gz and amazon-qa.json.gz must be handled differently since each line in their training\n",
    "    # data is split into multiple records due to the fact that each query has multiple positive pair responses\n",
    "    if file in ['S2ORC_citations_abstracts.json.gz','amazon-qa.json.gz']:\n",
    "        length = 0\n",
    "        for json_line in open(source_uri, transport_params={\"client\": s3_client}):\n",
    "            data = json.loads(json_line.strip())\n",
    "            length += len(data['pos'])\n",
    "    else:\n",
    "        length = int(os.popen(f'aws s3 cp {source_uri} - | zcat | wc -l').read().rstrip())\n",
    "    data_lengths[f'{file}'] = length\n",
    "    \n",
    "# write the resulting dictionary to a .json file for future use during training\n",
    "with open('data_records.json', 'w') as fileout:\n",
    "    json.dump(data_lengths, fileout)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_pytorch_p310",
   "language": "python",
   "name": "conda_pytorch_p310"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}