# Data Records

## This notebook generates the `data_records.json` file. Each entry in the resulting dictionary has the form `{filename: num_records}`, with one entry for every dataset we will use during training.

In [39]:
# import relevant libraries
import json
import os
import shlex

import boto3
from smart_open import open

In [None]:
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('lodestone-rnd')

# Collect all filenames from the data/ directory of the lodestone-rnd S3 bucket.
# The list is built dynamically rather than pre-sized to a hard-coded object
# count, so it keeps working when datasets are added to or removed from the
# bucket (a pre-sized list raises IndexError on growth and leaves empty
# placeholder names behind on shrinkage).
data_prefix = "data/"
excluded_files = {'cnn_dailymail_splitted.json.gz'}

files = []
for object_summary in my_bucket.objects.filter(Prefix=data_prefix):
    # Strip the leading "data/" so only the name relative to the prefix remains.
    name = object_summary.key[len(data_prefix):]
    # An empty name means the key was exactly "data/" -- presumably the folder
    # placeholder object that the listing returns first; it is not a dataset.
    if not name:
        continue
    if name in excluded_files:
        continue
    files.append(name)

s3_client = boto3.client("s3")

# S2ORC_citations_abstracts.json.gz and amazon-qa.json.gz must be handled
# differently: each line of their training data is split into multiple records,
# because each query has multiple positive pair responses (the 'pos' list).
multi_positive_files = {'S2ORC_citations_abstracts.json.gz', 'amazon-qa.json.gz'}

# For each training dataset, store the number of records in a dictionary with
# the following form: {filename: num_records}
data_lengths = {}
for file in files:
    source_uri = f's3://lodestone-rnd/data/{file}'
    if file in multi_positive_files:
        # One record per positive response, so sum the 'pos' lengths over all
        # lines. The `with` block makes sure the S3 stream is closed afterwards.
        with open(source_uri, transport_params={"client": s3_client}) as json_lines:
            length = sum(
                len(json.loads(json_line.strip())['pos'])
                for json_line in json_lines
            )
    else:
        # One record per line: stream the object through zcat and count lines.
        # The URI is shell-quoted so keys containing spaces or shell
        # metacharacters cannot break (or alter) the command.
        # NOTE(review): the pipeline's exit status is that of `wc`, so a failed
        # `aws s3 cp` would be recorded as 0 records -- confirm this is acceptable.
        command = f'aws s3 cp {shlex.quote(source_uri)} - | zcat | wc -l'
        with os.popen(command) as pipe:
            length = int(pipe.read().rstrip())
    data_lengths[file] = length
 
# Persist the {filename: num_records} mapping as JSON so it can be loaded later
# during training.
serialized_lengths = json.dumps(data_lengths)
with open('data_records.json', 'w') as records_out:
    records_out.write(serialized_lengths)