|
import boto3 |
|
import os |
|
import pandas as pd |
|
import json |
|
|
|
|
|
|
|
def upload_files(origin_path, destination_path, aws_access_key, aws_secret_key):
    """Upload every file found under ``origin_path`` to the 'gideon-corpus' bucket.

    Each file is stored under the key::

        destination_path + <last component of origin_path> + '/' + <path relative to origin_path>

    ``destination_path`` is concatenated verbatim, so include a trailing '/'
    when it is meant to act as a folder.

    Args:
        origin_path: Local directory to walk recursively. A trailing path
            separator is tolerated.
        destination_path: String prepended to every object key.
        aws_access_key: AWS access key id passed to ``boto3.Session``.
        aws_secret_key: AWS secret access key passed to ``boto3.Session``.
    """
    session = boto3.Session(aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key)
    bucket = session.resource('s3').Bucket('gideon-corpus')

    # Normalise only for naming: without this, a trailing separator makes the
    # folder name empty. The walk below still uses the caller's path unchanged.
    root_name = os.path.basename(os.path.normpath(origin_path))

    for subdir, _dirs, files in os.walk(origin_path):
        for file_name in files:
            full_path = os.path.join(subdir, file_name)
            # relpath instead of slicing by len(origin_path): slicing drops the
            # first character of the relative path when origin_path ends in a
            # separator.
            relative_path = os.path.relpath(full_path, origin_path)
            if os.sep != '/':
                # Keys are built with '/', so convert OS-native separators.
                relative_path = relative_path.replace(os.sep, '/')
            key = destination_path + root_name + '/' + relative_path
            with open(full_path, 'rb') as data:
                bucket.put_object(Key=key, Body=data)
|
|
|
|
|
def retrieve_logs(aws_access_key, aws_secret_key):
    """Fetch 'logs/logs.csv' from the 'gideon-corpus' bucket as a DataFrame.

    Args:
        aws_access_key: AWS access key id passed to ``boto3.client``.
        aws_secret_key: AWS secret access key passed to ``boto3.client``.

    Returns:
        The result of ``pandas.read_csv`` applied to the object's body.
    """
    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )
    response = client.get_object(Bucket='gideon-corpus', Key='logs/logs.csv')
    # The response's 'Body' entry is handed straight to pandas for parsing.
    return pd.read_csv(response['Body'])
|
|
|
def retrieve_casedocs(case_num, aws_access_key, aws_secret_key):
    """Fetch the opinions table and metadata for one case from 'gideon-corpus'.

    Reads 'Cases/<case_num>/opinions.csv' and 'Cases/<case_num>/metadata.json'.

    Args:
        case_num: Case folder name; concatenated into the object keys, so it
            must be a string.
        aws_access_key: AWS access key id passed to ``boto3.client``.
        aws_secret_key: AWS secret access key passed to ``boto3.client``.

    Returns:
        A tuple ``(opinions_df, metadata)``: the CSV parsed by
        ``pandas.read_csv`` and the JSON document parsed by ``json.loads``.
    """
    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )
    bucket_name = 'gideon-corpus'
    case_prefix = 'Cases/' + case_num + '/'

    opinions_obj = client.get_object(Bucket=bucket_name, Key=case_prefix + 'opinions.csv')
    opinions_df = pd.read_csv(opinions_obj['Body'])

    metadata_obj = client.get_object(Bucket=bucket_name, Key=case_prefix + 'metadata.json')
    # The JSON body is read in full and decoded as UTF-8 before parsing.
    raw_metadata = metadata_obj['Body'].read()
    metadata = json.loads(raw_metadata.decode('utf-8'))

    return opinions_df, metadata
|
|
|
|
|
def retrieve_all_casedocs(prefix, aws_access_key, aws_secret_key):
    """Fetch the documents of every case folder listed under ``prefix``.

    Lists the 'gideon-corpus' bucket with ``Delimiter='/'`` and collects the
    returned common prefixes. The second '/'-separated component of each
    common prefix is passed to ``retrieve_casedocs`` as the case name.
    NOTE(review): that indexing presumably expects ``prefix`` to be 'Cases/',
    matching the keys ``retrieve_casedocs`` reads; confirm against callers.

    Args:
        prefix: Key prefix used for the listing.
        aws_access_key: AWS access key id passed to ``boto3.client``.
        aws_secret_key: AWS secret access key passed to ``boto3.client``.

    Returns:
        A list of ``(opinions_df, metadata)`` tuples, one per listed folder,
        ordered by the folders' common-prefix strings.
    """
    s3 = boto3.client('s3', aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key)
    paginator = s3.get_paginator('list_objects_v2')

    # A set removes any common prefix that is reported more than once.
    common_prefixes = set()
    for page in paginator.paginate(Bucket="gideon-corpus", Prefix=prefix, Delimiter='/'):
        # Pages without sub-folders carry no usable 'CommonPrefixes' entry.
        for entry in page.get('CommonPrefixes') or []:
            common_prefixes.add(entry.get('Prefix'))

    # Sort before fetching: set iteration order of strings is not stable
    # between interpreter runs, and the returned tuples carry no case name.
    case_names = [p.split('/')[1] for p in sorted(common_prefixes)]

    return [
        retrieve_casedocs(name, aws_access_key, aws_secret_key)
        for name in case_names
    ]
|
|