# NOTE: page-banner residue captured together with this file ("Spaces: Runtime error"); it is not part of the program.
import json
import os
import sys
from urllib.parse import urlparse, parse_qs

import requests
from bs4 import BeautifulSoup

from utils import isValidURL
from nlp import summarizer
# REQUIREMENTS: https://pixeltree.notion.site/City-Council-Scraping-34a2f5a24d59400faf9a128f2653ebf2
# Meeting minutes directory: https://pub-calgary.escribemeetings.com
# INPUT (arg 1): valid URL pointing to meeting minutes; it needs to be wrapped in quotes.
# OPTIONAL INPUT (arg 2): output directory.
# OUTPUT: JSON document containing the required information scraped from the input URL.
# Debug mode: verbose printing is enabled only when this file is executed
# directly as a script; importing it as a module leaves it switched off.
DEBUG = __name__ == "__main__"
def _text_or_empty(node):
    """Return ``node.text``, or ``""`` when ``node`` is ``None``."""
    return "" if node is None else node.text


def _parse_header(agenda_header):
    """Extract subtitle, start time, location and attendance from the header.

    Args:
        agenda_header: The ``header.AgendaHeader`` element, or ``None`` when
            the page has no such element.

    Returns:
        A dict with the keys ``agenda_header_subtitle``, ``start_time``,
        ``location`` and ``attendance``. Any field whose element is missing
        falls back to ``""`` (or to empty lists for attendance).
    """
    header = {
        "agenda_header_subtitle": "",
        "start_time": "",
        "location": "",
        "attendance": {"present": [], "also_present": []},
    }
    if agenda_header is None:
        return header

    header["agenda_header_subtitle"] = _text_or_empty(
        agenda_header.find("p", class_="AgendaHeaderSubTitle")
    )
    header["start_time"] = _text_or_empty(agenda_header.find("time"))
    # NOTE: this does not capture all of the location information on the page.
    header["location"] = _text_or_empty(
        agenda_header.find("div", class_="Value LocationValue")
    )

    table = agenda_header.find(class_="AgendaHeaderAttendanceTable")
    divs = table.find_all("div") if table is not None else []
    # The attendee lists are read from the divs at positions 2 and 5 of the
    # attendance table (per the original author: split by who can and who
    # cannot vote). A shorter table simply leaves the list empty.
    for key, index in (("present", 2), ("also_present", 5)):
        if index < len(divs):
            header["attendance"][key] = [
                li.text for li in divs[index].find_all("li")
            ]
    return header


def _parse_roll_call(agenda_items):
    """Return the roll-call names found in the first agenda item container.

    The names come from the third ``p.Body1`` paragraph of the first
    ``div`` whose class is ``AgendaItemContainer indent``: the trailing
    period is dropped, ", and " is normalised to ", " and the text is split
    on ", ". An empty list is returned when that paragraph does not exist.
    """
    container = agenda_items.find("div", class_="AgendaItemContainer indent")
    if container is None:
        return []
    paragraphs = container.find_all("p", class_="Body1")
    if len(paragraphs) < 3:
        return []
    return paragraphs[2].text.rstrip(".").replace(", and ", ", ").split(", ")


def _parse_motion(motion, s=None):
    """Collect the details of one ``ul.AgendaItemMotions`` element.

    Args:
        motion: The ``ul`` element holding the motion markup.
        s: Optional summarizer object exposing ``max_length`` and
            ``summarize(text=...)``. When ``None`` no ``summary`` key is
            produced.

    Returns:
        A dict with the keys ``titles``, ``moved_by``, ``details``,
        ``results``, ``votes``, ``attachment_names``, ``attachment_links``,
        ``attachment_count`` and, when ``s`` is given and the motion has at
        least one description, ``summary``.
    """
    description_nodes = motion.find_all("div", class_="MotionText RichText")
    # "Anchor": the element four levels above each motion text; the motion
    # title and attachment links are looked up beneath it.
    anchors = [node.parent.parent.parent.parent for node in description_nodes]

    descriptions = [node.text for node in description_nodes]

    votes = []
    for table in motion.find_all("table", class_="MotionVoters"):
        text = table.text
        # Drop everything up to and including the first ')' before splitting.
        votes.append(text[text.find(")") + 1:].split(", and "))

    # Only the first anchor's attachments are reported. A motion without any
    # motion text has no anchor at all, so fall back to empty lists instead
    # of indexing into an empty list.
    links = anchors[0].find_all("a", class_="Link") if anchors else []
    attachment_names = [link.text for link in links]
    # .get keeps names and links aligned even if a link lacks an href.
    attachment_links = [link.get("href", "") for link in links]

    motion_obj = {
        "titles": [
            _text_or_empty(anchor.find("div", class_="AgendaItemTitle")).strip()
            for anchor in anchors
        ],
        "moved_by": [
            _text_or_empty(node.find("span", class_="Value"))
            for node in motion.find_all("div", class_="MovedBy")
        ],
        "details": descriptions,
        "results": [
            node.text for node in motion.find_all("div", class_="MotionResult")
        ],
        "votes": votes,
        "attachment_names": attachment_names,
        "attachment_links": attachment_links,
        "attachment_count": len(attachment_names),
    }

    if s is not None and descriptions:
        # The stored summary has always been the one for the last
        # description (earlier ones were overwritten), so only that one is
        # summarised.
        last_description = descriptions[-1]
        if len(last_description.split()) > s.max_length:
            motion_obj["summary"] = s.summarize(text=last_description)[0]
        else:
            motion_obj["summary"] = "Too short to summarize"
    return motion_obj


def _scrape_minutes(URL, summarize, timeout):
    """Shared implementation behind both public scraper entry points.

    Args:
        URL: Address of the meeting-minutes page; must carry an ``Id`` query
            parameter.
        summarize: When true, a ``summarizer`` is created and a ``summary``
            entry is added to each motion that has a description.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The string ``"Invalid URL"`` when ``isValidURL`` rejects ``URL``;
        otherwise a dict with ``meeting_id``, the header fields,
        ``roll_call`` and one ``"<item>.<motion>"`` entry per motion.

    Raises:
        KeyError: If ``URL`` has no ``Id`` query parameter.
        AttributeError: If the page lacks the ``package-container`` element
            or the ``AgendaItems`` div.
    """
    if not isValidURL(URL):
        print("Invalid or missing URL input")
        print("Please enter a URL now:")
        return "Invalid URL"

    # Object to be serialized.
    JSON_obj = {}
    JSON_obj["meeting_id"] = parse_qs(urlparse(URL).query)["Id"][0]

    # Only build the summarizer when summaries were actually requested.
    s = summarizer() if summarize else None

    page = requests.get(URL, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    # Most of the page content is found in this container.
    page_content = soup.find(id="package-container")

    JSON_obj.update(
        _parse_header(page_content.find("header", class_="AgendaHeader"))
    )

    agenda_items = page_content.find("div", class_="AgendaItems")
    JSON_obj["roll_call"] = _parse_roll_call(agenda_items)
    if DEBUG:
        print(JSON_obj["roll_call"])

    # Walk the direct child *tags* only: ``.children`` also yields text
    # nodes (e.g. whitespace between items), which have no ``find_all``.
    top_level_items = agenda_items.find_all(True, recursive=False)
    for item_number, agenda_item in enumerate(top_level_items, start=1):
        if DEBUG:
            print(item_number)

        motions = agenda_item.find_all("ul", class_="AgendaItemMotions")
        for item_sub_number, motion in enumerate(motions, start=1):
            label = f"{item_number}.{item_sub_number}"
            if DEBUG:
                print(label)

            motion_obj = _parse_motion(motion, s)

            if DEBUG:
                print(label)
                print(motion_obj["titles"])  # title
                print("Moved by: " + str(motion_obj["moved_by"]))  # Moved by
                print(motion_obj["details"])  # Other details
                print("Result: " + str(motion_obj["results"]))  # Result
                print("Votes: " + str(motion_obj["votes"]))  # Votes
                print(motion_obj["attachment_names"])  # attachment names
                print(motion_obj["attachment_links"])  # attachment links
                print()

            JSON_obj[label] = motion_obj

        if DEBUG:
            print('-----------------------------------\n\n\n')

    return JSON_obj


def minutes_scraper(URL="", timeout=30):
    """Scrape a meeting-minutes page and summarise each motion.

    Args:
        URL: Address of the meeting-minutes page; must carry an ``Id`` query
            parameter.
        timeout: Timeout in seconds for the HTTP request.

    Returns:
        ``"Invalid URL"`` when the URL fails validation; otherwise a dict
        with the meeting id, header fields, roll call and one
        ``"<item>.<motion>"`` entry per motion. Motions with a description
        carry a ``summary`` entry.

    Raises:
        KeyError: If ``URL`` has no ``Id`` query parameter.
        AttributeError: If the page lacks the expected main containers.
    """
    return _scrape_minutes(URL, summarize=True, timeout=timeout)


def minutes_scraper_no_sum(URL="", timeout=30):
    """Scrape a meeting-minutes page without producing motion summaries.

    Same output as ``minutes_scraper`` minus the ``summary`` entries; no
    summarizer object is created.

    Args:
        URL: Address of the meeting-minutes page; must carry an ``Id`` query
            parameter.
        timeout: Timeout in seconds for the HTTP request.

    Returns:
        ``"Invalid URL"`` when the URL fails validation; otherwise the
        scraped dict described above.

    Raises:
        KeyError: If ``URL`` has no ``Id`` query parameter.
        AttributeError: If the page lacks the expected main containers.
    """
    return _scrape_minutes(URL, summarize=False, timeout=timeout)