Spaces:

noelfranthomas
/

test_transparencyAPI

Runtime error

App Files Files Community

test_transparencyAPI / web_scraper.py

noelfranthomas

NLP

01ef47d over 2 years ago

raw

history blame

13.8 kB

	import requests
	from bs4 import BeautifulSoup
	from urllib.parse import urlparse, parse_qs

	from utils import isValidURL
	from nlp import summarizer

	import json
	import sys
	import os

	# REQUIREMENTS: https://pixeltree.notion.site/City-Council-Scraping-34a2f5a24d59400faf9a128f2653ebf2
	# Meeting Minutes Directory: https://pub-calgary.escribemeetings.com

	# INPUT (arg 1): Valid URL pointing to meeting minutes. Needs to be wrapped in quotes
	# OPTIONAL INPUT (arg 2): Output directory
	# OUTPUT: JSON document containing required information scraped from input URL

	# Debug mode: verbose tracing is enabled only when this file is executed
	# directly as a script; it stays off when the module is imported.
	DEBUG = __name__ == "__main__"

	def minutes_scraper(URL=""):
	    """Scrape a meeting-minutes page and summarize each motion.

	    Args:
	        URL: Address of the meeting-minutes page. It must be accepted by
	            ``isValidURL`` and carry an ``Id`` query parameter.

	    Returns:
	        The string ``"Invalid URL"`` when ``isValidURL`` rejects *URL*.
	        Otherwise a dict with the keys ``meeting_id``,
	        ``agenda_header_subtitle``, ``start_time``, ``location``,
	        ``attendance``, ``roll_call`` and one ``"<item>.<motion>"`` entry per
	        motion list found on the page. Each motion entry also carries a
	        ``summary`` key whenever the motion has at least one description.

	    Raises:
	        KeyError: If *URL* has no ``Id`` query parameter.
	        AttributeError: If the page lacks the ``package-container`` or
	            ``AgendaItems`` element.
	    """
	    return _scrape_minutes(URL, summarize=True)


	def minutes_scraper_no_sum(URL=""):
	    """Scrape a meeting-minutes page without producing summaries.

	    Same contract as ``minutes_scraper`` except that no summarizer is built
	    and motion entries never contain a ``summary`` key.
	    """
	    return _scrape_minutes(URL, summarize=False)


	# Seconds to wait for the minutes server before giving up; without a timeout
	# ``requests.get`` can block forever on an unresponsive host.
	_REQUEST_TIMEOUT_SECONDS = 30


	def _scrape_minutes(URL, summarize):
	    """Shared implementation behind both public scraper entry points."""
	    if not isValidURL(URL):
	        # NOTE(review): the second message reads like an input prompt, but
	        # nothing is read from stdin; kept for backward-compatible output.
	        print("Invalid or missing URL input")
	        print("Please enter a URL now:")
	        return "Invalid URL"

	    # Only build the summarizer when it is needed.
	    # NOTE(review): construction is presumably expensive (model load) - confirm.
	    summarizer_obj = summarizer() if summarize else None

	    # Object to be serialized
	    JSON_obj = {}

	    # The meeting ID comes from the URL's query string, not from the page.
	    JSON_obj["meeting_id"] = parse_qs(urlparse(URL).query)["Id"][0]

	    page = requests.get(URL, timeout=_REQUEST_TIMEOUT_SECONDS)
	    soup = BeautifulSoup(page.content, "html.parser")

	    # Most of the page content is found in this container
	    page_content = soup.find(id="package-container")

	    # Header: subtitle, start time, location and attendance
	    JSON_obj.update(_parse_header(page_content.find("header", class_="AgendaHeader")))

	    # Body: roll call followed by the motions of every agenda item
	    agenda_items = page_content.find("div", class_="AgendaItems")
	    JSON_obj["roll_call"] = _parse_roll_call(agenda_items)

	    if DEBUG:
	        print(JSON_obj["roll_call"])

	    # Walk direct child *tags* only. ``.children`` also yields text nodes
	    # (whitespace, comments), which are not tags and would either break the
	    # search below or consume an item number.
	    top_level_items = agenda_items.find_all(True, recursive=False)

	    for item_number, agenda_item in enumerate(top_level_items, start=1):
	        if DEBUG:
	            print(item_number)

	        motion_lists = agenda_item.find_all("ul", class_="AgendaItemMotions")
	        for item_sub_number, motion in enumerate(motion_lists, start=1):
	            key = f'{item_number}.{item_sub_number}'

	            # Progress marker printed before parsing, so a parse failure
	            # can be traced to the motion that caused it.
	            if DEBUG:
	                print(key)

	            motion_obj = _parse_motion(motion, summarizer_obj)

	            if DEBUG:
	                _debug_print_motion(key, motion_obj)

	            JSON_obj[key] = motion_obj

	        if DEBUG:
	            print('-----------------------------------\n\n\n')

	    # TODO: serialize to "meeting_minutes.json" / add the result to the database.
	    return JSON_obj


	def _find_text(parent, *args, **kwargs):
	    """Return the text of ``parent.find(...)``, or "" if anything is missing."""
	    try:
	        return parent.find(*args, **kwargs).text
	    except AttributeError:
	        # Either *parent* is None or nothing matched.
	        return ""


	def _list_item_texts(sections, index):
	    """Return the <li> texts inside ``sections[index]``, or [] if out of range."""
	    try:
	        return [li.text for li in sections[index].find_all("li")]
	    except IndexError:
	        return []


	def _parse_header(agenda_header):
	    """Extract subtitle, start time, location and attendance from the header."""
	    header = {}
	    header["agenda_header_subtitle"] = _find_text(agenda_header, "p", class_="AgendaHeaderSubTitle")
	    header["start_time"] = _find_text(agenda_header, "time")
	    # NOTE: this selector does not capture all of the location information.
	    header["location"] = _find_text(agenda_header, "div", class_="Value LocationValue")

	    # Attendance is separated by who can and cannot vote.
	    try:
	        sections = agenda_header.find(class_="AgendaHeaderAttendanceTable").find_all("div")
	    except AttributeError:
	        # No header or no attendance table: report empty attendance.
	        sections = []

	    header["attendance"] = {
	        'present': _list_item_texts(sections, 2),
	        'also_present': _list_item_texts(sections, 5),
	    }
	    return header


	def _parse_roll_call(agenda_items):
	    """Return the names from the roll-call paragraph, or [] when it is absent."""
	    containers = agenda_items.find_all("div", class_="AgendaItemContainer indent")
	    try:
	        paragraphs = containers[0].find_all("p", class_="Body1")
	        # The third "Body1" paragraph holds "A, B, and C." - strip the final
	        # period and normalise the ", and " separator before splitting.
	        return paragraphs[2].text.rstrip('.').replace(', and ', ', ').split(', ')
	    except IndexError:
	        return []


	def _voter_names(table_text):
	    """Split a voters table's text (after the first ')') on ', and '."""
	    return table_text[table_text.find(')') + 1:].split(', and ')


	def _parse_motion(motion, summarizer_obj):
	    """Build the dict describing one ``AgendaItemMotions`` list.

	    *summarizer_obj* is None when no summary is wanted.
	    """
	    text_divs = motion.find_all("div", class_="MotionText RichText")

	    # "Anchor": the element four levels above each motion text, which also
	    # holds the motion's title and attachment links.
	    anchors = [div.parent.parent.parent.parent for div in text_divs]

	    descriptions = [div.text for div in text_divs]
	    attachments = [anchor.find_all("a", class_="Link") for anchor in anchors]

	    # Only the first anchor's attachments are reported (as before). A motion
	    # list with no motion text has no anchor at all, so fall back to an
	    # empty list instead of raising IndexError.
	    first_attachments = attachments[0] if attachments else []
	    attachment_names = [link.text for link in first_attachments]
	    attachment_links = [link['href'] for link in first_attachments]

	    motion_obj = {}
	    motion_obj["titles"] = [
	        anchor.find("div", class_="AgendaItemTitle").text.strip() for anchor in anchors
	    ]
	    motion_obj["moved_by"] = [
	        mover.find("span", class_="Value").text
	        for mover in motion.find_all("div", class_="MovedBy")
	    ]
	    motion_obj["details"] = descriptions
	    motion_obj["results"] = [
	        result.text for result in motion.find_all("div", class_="MotionResult")
	    ]
	    motion_obj["votes"] = [
	        _voter_names(table.text) for table in motion.find_all("table", class_="MotionVoters")
	    ]
	    motion_obj['attachment_names'] = attachment_names
	    motion_obj['attachment_links'] = attachment_links
	    motion_obj['attachment_count'] = len(attachment_names)

	    if summarizer_obj is not None and descriptions:
	        # The single 'summary' slot has always ended up holding the verdict
	        # for the LAST description, so only that one is summarized.
	        # NOTE(review): assumes summarize() is a pure function of its text.
	        last_description = descriptions[-1]
	        if len(last_description.split()) > summarizer_obj.max_length:
	            motion_obj['summary'] = summarizer_obj.summarize(text=last_description)[0]
	        else:
	            motion_obj['summary'] = "Too short to summarize"

	    return motion_obj


	def _debug_print_motion(key, motion_obj):
	    """Print one parsed motion for DEBUG tracing."""
	    print(key)
	    print(motion_obj["titles"])  # title
	    print("Moved by: " + str(motion_obj["moved_by"]))  # Moved by
	    print(motion_obj["details"])  # Other details
	    print("Result: " + str(motion_obj["results"]))  # Result
	    print("Votes: " + str(motion_obj["votes"]))  # Votes
	    print(motion_obj['attachment_names'])  # attachment names
	    print(motion_obj['attachment_links'])  # attachment links
	    print()