'''Script for scraping and formatting Supreme Court case summaries and court opinions from Justia since 2013.'''

import json
import math
import re

import requests
from bs4 import BeautifulSoup


def remove_tags(text: str) -> str:
    '''Strip tabs, carriage returns, newlines, and non-ASCII characters from scraped text.'''
    text = re.sub(r'\t|\r', '', text)
    text = re.sub(r'\n', ' ', text)
    text = text.encode('ascii', 'ignore').decode()
    return text
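
# For reference, the cleanup above turns a string like 'Per\tCuriam\n§2'
# into 'PerCuriam 2': tabs and carriage returns are dropped, newlines become
# spaces, and non-ASCII characters such as the section sign are stripped.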


def clean_soup(url, verbose=False):
    '''Get the docket number, court opinion, and justice opinions for a given case URL.'''
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html')
    case_details = dict()
    docket_no = soup.find_all(True, {'class': ['flex-col', 'reset-width-below-tablet', 'item']})[0].find_all('span')[0].get_text()
    # get the summary; if there is none, mark the case as dismissed
    case_summaries = soup.find_all('div', id='summary')
    if len(case_summaries) == 0:
        return docket_no, {'summary': 'Dismissed'}
    case_summary = case_summaries[0].get_text()
    case_details['summary'] = remove_tags(case_summary)
    # look for a per curiam opinion first
    per_curiam = soup.find_all(True, {'data-gtm-label': ['Opinions Tab - Per Curiam']})
    if len(per_curiam) > 0:
        opinion = soup.find_all('div', {'id': re.compile(r'tab-opinion-\d+')})[0]
        opinion = remove_tags(opinion.get_text())
        court_opinion_text = re.findall(r'(?<=Per Curiam.)[\w\W]+', opinion)[0].strip()
        case_details['court_opinion'] = court_opinion_text
    else:
        # otherwise walk the signed opinions, skipping the first opinion tab
        for opinion in soup.find_all('div', {'id': re.compile(r'tab-opinion-')})[1:]:
            opinion = remove_tags(opinion.get_text())
            justice_name = re.findall(r'(?<=Justice\s)\w+', opinion)[0]
            if verbose:
                print(justice_name)
            # the authoring justice's text doubles as the opinion of the Court
            court_opinion_text = re.findall(r'(?<=delivered the opinion of the Court.)[\w\W]+', opinion)
            if len(court_opinion_text) > 0:
                justice_opinion = court_opinion_text[0].strip()
                case_details['court_opinion'] = justice_opinion
                case_details[justice_name] = justice_opinion
                if verbose:
                    print(justice_opinion)
            else:
                # concurrences and dissents are stored under the justice's name
                justice_opinion = re.findall(r'((?<=dissenting.)[\w\W]+|(?<=concurring.)[\w\W]+)', opinion)[0].strip()
                if verbose:
                    print(justice_opinion)
                case_details[justice_name] = justice_opinion
    return docket_no, case_details
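
# Illustrative shape of the return value for an argued case (the exact keys
# depend on which opinions Justia lists for that case):
#   docket_no    -> a docket string such as '21-476'
#   case_details -> {'summary': ..., 'court_opinion': ..., 'Sotomayor': ..., ...}
# Cases without a Justia summary return (docket_no, {'summary': 'Dismissed'}).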


if __name__ == "__main__":
    # Scrape case_urls from Justia's per-year index pages
    years = [f'https://supreme.justia.com/cases/federal/us/year/{i}.html' for i in range(2013, 2024)]
    case_urls = []
    for year_url in years:
        r = requests.get(year_url)
        soup = BeautifulSoup(r.content, 'html')
        for link in soup.find_all(True, {'class': ['color-green', 'text-soft-wrap']}):
            case_urls.append('https://supreme.justia.com' + link.a['href'])

    # scrape cases, track errors
    case_data = dict()
    failed_urls = []
    for i, url in enumerate(case_urls, start=1):
        print(f'Scraping case {i}')
        try:
            docket_no, opinions = clean_soup(url)
            assert opinions
            case_data[docket_no] = opinions
        except Exception as e:
            print(f'Failed on url {url}')
            failed_urls.append([url, str(e)])

    # Serialize case_data as json and write it to case_data.json
    json_object = json.dumps(case_data, indent=4)
    with open("case_data.json", "w") as outfile:
        outfile.write(json_object)

    # Create chunked dataset for hugging face dataloaders
    data = []
    current = 1
    max_char = 4000
    for k, v in case_data.items():
        print(f'Formatting case {current}')
        current += 1
        # skip dismissed cases and cases without a court opinion
        if v['summary'] != 'Dismissed':
            summary = re.findall(r'(?<=Justia Summary\s\s\s)[\w\W]+', v['summary'])[0]
        else:
            continue
        if v.get('court_opinion') is None:
            continue
        # strip trailing footnotes ("Notes 1 ...") from the opinion, if present
        if re.search(r'[\w\W]+(?=Notes 1 \xa0)', v['court_opinion']):
            court_opinion = re.findall(r'[\w\W]+(?=Notes 1 \xa0)', v['court_opinion'])[0]
        else:
            court_opinion = v['court_opinion']
        if len(court_opinion) + len(summary) > max_char:
            # split the opinion into roughly even chunks so that each
            # summary + chunk pair stays under max_char characters
            max_len = max_char - len(summary)
            n_chunks = math.ceil(len(court_opinion) / max_len)
            chunk_size = math.ceil(len(court_opinion) / n_chunks)
            chunk_suffix = 1
            for i in range(0, len(court_opinion), chunk_size):
                chunk = court_opinion[i:i + chunk_size]
                data.append({
                    'docket_no': k + str(chunk_suffix),
                    'summary': summary,
                    'opinion': chunk
                })
                chunk_suffix += 1
        else:
            data.append({
                'docket_no': k,
                'summary': summary,
                'opinion': court_opinion
            })

    dataloader_formatted = dict(
        version='1.0',
        data=data
    )
    json_object = json.dumps(dataloader_formatted, indent=4)
    with open("chunked_case_data.json", "w") as outfile:
        outfile.write(json_object)
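
    # A minimal sketch (assuming the Hugging Face `datasets` library is the
    # intended consumer) of how the chunked file could be read back; the
    # 'field' argument points the JSON loader at the top-level 'data' key:
    #
    #   from datasets import load_dataset
    #   dataset = load_dataset('json', data_files='chunked_case_data.json', field='data')
    #   print(dataset['train'][0]['docket_no'])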