# Gideon / AWSHandler.py
# (Hosting-page header residue, kept as comments so the module parses:
#  "cools's picture" / "Upload AWSHandler.py" / aeed5b8)
import boto3
import os
import pandas as pd
import json
def upload_files(origin_path, destination_path, aws_access_key, aws_secret_key):
    """Upload every file under a local directory tree to the ``gideon-corpus`` bucket.

    Each file is stored under the key::

        destination_path + <name of the origin directory> + '/' + <path relative to it>

    ``destination_path`` is prepended verbatim, so it must already end with
    ``/`` if it is meant to act as a folder.

    Args:
        origin_path: Local directory to walk recursively. A trailing path
            separator is tolerated.
        destination_path: Key prefix inside the bucket.
        aws_access_key: AWS access key id used to open the session.
        aws_secret_key: AWS secret access key used to open the session.
    """
    session = boto3.Session(aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key)
    bucket = session.resource('s3').Bucket('gideon-corpus')

    # Normalise once so that a trailing separator ('corpus/') neither yields an
    # empty directory name nor shifts the relative-path offset by one character.
    root = os.path.normpath(origin_path)
    key_prefix = destination_path + os.path.basename(root) + '/'

    # Walk the path exactly as given: an empty string then yields nothing,
    # whereas its normalised form ('.') would walk the working directory.
    for subdir, _dirs, files in os.walk(origin_path):
        for filename in files:
            full_path = os.path.join(subdir, filename)
            # S3 keys always use '/', whatever the local separator is.
            relative_key = os.path.relpath(full_path, root).replace(os.sep, '/')
            with open(full_path, 'rb') as data:
                bucket.put_object(Key=key_prefix + relative_key, Body=data)
def retrieve_logs(aws_access_key, aws_secret_key):
    """Fetch ``logs/logs.csv`` from the ``gideon-corpus`` bucket as a DataFrame.

    Args:
        aws_access_key: AWS access key id used to create the S3 client.
        aws_secret_key: AWS secret access key used to create the S3 client.

    Returns:
        The CSV contents parsed by ``pandas.read_csv``.
    """
    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )
    # The response body is a file-like stream, so pandas can read it directly.
    body = client.get_object(Bucket='gideon-corpus', Key='logs/logs.csv')['Body']
    return pd.read_csv(body)
def retrieve_casedocs(case_num, aws_access_key, aws_secret_key):
    """Fetch the opinions table and metadata for one case from ``gideon-corpus``.

    Storage layout on AWS: each case lives under ``Cases/<case_num>/`` and
    holds an ``opinions.csv`` and a ``metadata.json``.

    Args:
        case_num: Case identifier as a string; it is concatenated into the key.
        aws_access_key: AWS access key id used to create the S3 client.
        aws_secret_key: AWS secret access key used to create the S3 client.

    Returns:
        A ``(opinions_df, metadata)`` tuple: the CSV parsed by
        ``pandas.read_csv`` and the JSON document decoded from UTF-8.
    """
    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )
    case_prefix = 'Cases/' + case_num + '/'

    opinions_body = client.get_object(Bucket='gideon-corpus', Key=case_prefix + 'opinions.csv')['Body']
    opinions_df = pd.read_csv(opinions_body)

    metadata_body = client.get_object(Bucket='gideon-corpus', Key=case_prefix + 'metadata.json')['Body']
    metadata = json.loads(metadata_body.read().decode('utf-8'))

    return opinions_df, metadata
def retrieve_all_casedocs(prefix, aws_access_key, aws_secret_key):
    """Fetch the documents of every case folder found directly under ``prefix``.

    The ``gideon-corpus`` bucket is listed with ``/`` as delimiter, so each
    returned common prefix is one "sub-directory" of ``prefix``. The last path
    component of each is used as the case number and passed to
    ``retrieve_casedocs``.

    Note: ``retrieve_casedocs`` always reads from ``Cases/<case_num>/``, so in
    practice ``prefix`` is expected to be ``'Cases/'``.

    Args:
        prefix: Key prefix to list, e.g. ``'Cases/'``.
        aws_access_key: AWS access key id.
        aws_secret_key: AWS secret access key.

    Returns:
        A list of ``(opinions_df, metadata)`` tuples, ordered by case number
        (string sort). Empty when no sub-directories are found.
    """
    s3 = boto3.client('s3', aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key)
    paginator = s3.get_paginator('list_objects_v2')

    case_prefixes = set()
    for page in paginator.paginate(Bucket="gideon-corpus", Prefix=prefix, Delimiter='/'):
        # 'CommonPrefixes' is absent from pages that contain no sub-directories.
        case_prefixes.update(entry['Prefix'] for entry in page.get('CommonPrefixes') or [])

    # Common prefixes look like 'Cases/<case_num>/'. Take the last component
    # rather than a fixed index so the depth of `prefix` does not matter, and
    # sort so the result order is stable across runs (set order is not).
    case_nums = sorted(p.rstrip('/').rsplit('/', 1)[-1] for p in case_prefixes)

    return [retrieve_casedocs(case_num, aws_access_key, aws_secret_key) for case_num in case_nums]