'''Script for scraping and formatting Supreme Court case summaries and court opinions from Justia since 2013.'''
import json
import math
import re

import requests
from bs4 import BeautifulSoup

def remove_tags(text: str) -> str:
    '''Collapse whitespace and drop non-ASCII characters from scraped text.'''
    text = re.sub(r'\t|\r', '', text)
    text = re.sub(r'\n', ' ', text)
    # str.encode/decode returns a new string, so the result must be reassigned.
    text = text.encode('ascii', 'ignore').decode()
    return text
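
# Example (illustrative): remove_tags('line one\nline two\r') -> 'line one line two'
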
def clean_soup(url, verbose=False):
    '''Return the docket number and a dict of the summary, court opinion, and
    per-justice opinions for a given Justia case URL.'''
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    case_details = dict()
    docket_no = soup.find_all(
        True, {'class': ['flex-col', 'reset-width-below-tablet', 'item']}
    )[0].find_all('span')[0].get_text()
    # Get the summary; if there is none, treat the case as dismissed.
    case_summaries = soup.find_all('div', id='summary')
    if len(case_summaries) == 0:
        return docket_no, {'summary': 'Dismissed'}
    case_summary = case_summaries[0].get_text()
    case_details['summary'] = remove_tags(case_summary)
    # Look for a per curiam opinion first.
    per_curiam = soup.find_all(True, {'data-gtm-label': ['Opinions Tab - Per Curiam']})
    if len(per_curiam) > 0:
        opinion = soup.find_all('div', {'id': re.compile(r'tab-opinion-\d+')})[0]
        opinion = remove_tags(opinion.get_text())
        court_opinion_text = re.findall(r'(?<=Per Curiam\.)[\w\W]+', opinion)[0].strip()
        case_details['court_opinion'] = court_opinion_text
    else:
        # Skip the first opinion tab (typically the syllabus) and parse each justice's opinion.
        for opinion in soup.find_all('div', {'id': re.compile(r'tab-opinion-')})[1:]:
            opinion = remove_tags(opinion.get_text())
            justice_name = re.findall(r'(?<=Justice\s)\w+', opinion)[0]
            if verbose:
                print(justice_name)
            # The majority opinion follows 'delivered the opinion of the Court.'
            court_opinion_text = re.findall(
                r'(?<=delivered the opinion of the Court\.)[\w\W]+', opinion)
            if len(court_opinion_text) > 0:
                justice_opinion = court_opinion_text[0].strip()
                case_details['court_opinion'] = justice_opinion
                case_details[justice_name] = justice_opinion
                if verbose:
                    print(justice_opinion)
            else:
                # Anything else is a dissent or concurrence.
                justice_opinion = re.findall(
                    r'((?<=dissenting\.)[\w\W]+|(?<=concurring\.)[\w\W]+)', opinion)[0].strip()
                if verbose:
                    print(justice_opinion)
                case_details[justice_name] = justice_opinion
    return docket_no, case_details
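
# Usage sketch (the URL here is hypothetical; real URLs are collected in __main__ below):
#   docket_no, details = clean_soup(
#       'https://supreme.justia.com/cases/federal/us/2013/12-1234/', verbose=True)
#   details['summary']            # cleaned Justia summary text
#   details.get('court_opinion')  # majority (or per curiam) opinion, if one was found
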
if __name__ == "__main__":
    # Scrape case URLs from Justia's term index pages (2013-2023).
    years = [f'https://supreme.justia.com/cases/federal/us/year/{i}.html'
             for i in range(2013, 2024)]
    case_urls = []
    for year_url in years:
        r = requests.get(year_url)
        soup = BeautifulSoup(r.content, 'html.parser')
        for link in soup.find_all(True, {'class': ['color-green', 'text-soft-wrap']}):
            case_urls.append('https://supreme.justia.com' + link.a['href'])
    # Scrape each case, tracking failures for later inspection.
    case_data = dict()
    failed_urls = []
    for i, url in enumerate(case_urls, start=1):
        print(f'Scraping case {i}')
        try:
            docket_no, opinions = clean_soup(url)
            assert opinions
            case_data[docket_no] = opinions
        except Exception as e:
            print(f'Failed on url {url}')
            failed_urls.append([url, str(e)])
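    # failed_urls stays in memory only; a sketch to persist it for review
    # (the filename is an assumption, not part of the original script):
    #   with open("failed_urls.json", "w") as f:
    #       json.dump(failed_urls, f, indent=4)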
    # Serialize case_data as json.
    with open("case_data.json", "w") as outfile:
        json.dump(case_data, outfile, indent=4)
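    # Illustrative shape of case_data.json (docket numbers and text are made up):
    #   {"22-1234": {"summary": "...", "court_opinion": "...", "Kagan": "..."}}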
    # Create a chunked dataset for Hugging Face dataloaders.
    data = []
    max_char = 4000
    for current, (k, v) in enumerate(case_data.items(), start=1):
        print(current)
        if v['summary'] == 'Dismissed':
            continue
        summary = re.findall(r'(?<=Justia Summary\s\s\s)[\w\W]+', v['summary'])[0]
        if v.get('court_opinion') is None:
            continue
        # Strip the footnotes section from the end of the opinion, if present.
        notes = re.search(r'[\w\W]+(?=Notes 1 \xa0)', v['court_opinion'])
        court_opinion = notes.group(0) if notes else v['court_opinion']
        if len(court_opinion) + len(summary) > max_char:
            # Split the opinion into evenly sized chunks so that each record,
            # summary included, fits within the character budget.
            max_len = max_char - len(summary)
            n_chunks = math.ceil(len(court_opinion) / max_len)
            chunk_size = math.ceil(len(court_opinion) / n_chunks)
            chunk_suffix = 1
            for i in range(0, len(court_opinion), chunk_size):
                data.append({
                    'docket_no': k + str(chunk_suffix),
                    'summary': summary,
                    'opinion': court_opinion[i:i + chunk_size]
                })
                chunk_suffix += 1
        else:
            data.append({
                'docket_no': k,
                'summary': summary,
                'opinion': court_opinion
            })
    dataloader_formatted = dict(
        version='1.0',
        data=data
    )
    with open("chunked_case_data.json", "w") as outfile:
        json.dump(dataloader_formatted, outfile, indent=4)
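
# Loading sketch (an assumption, not part of this script: requires the Hugging Face
# `datasets` library; field='data' matches the key serialized above):
#   from datasets import load_dataset
#   ds = load_dataset('json', data_files='chunked_case_data.json', field='data')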