Spaces:

noelfranthomas
/

test_transparencyAPI

Runtime error

File size: 13,756 Bytes

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs

from utils import isValidURL
from nlp import summarizer

import json
import sys
import os

# REQUIREMENTS: https://pixeltree.notion.site/City-Council-Scraping-34a2f5a24d59400faf9a128f2653ebf2
# Meeting Minutes Directory: https://pub-calgary.escribemeetings.com

# INPUT (arg 1): Valid URL pointing to meeting minutes. Needs to be wrapped in quotes
# OPTIONAL INPUT (arg 2): Output directory
# OUTPUT: JSON document containing required information scraped from input URL

# Debug mode: verbose tracing is switched on only when this file is executed
# directly as a script, and stays off when it is imported as a module.
DEBUG = __name__ == "__main__"

def minutes_scraper(URL=""):
    """Scrape a meeting-minutes page and summarize each motion's description.

    Args:
        URL: Address of the meeting-minutes page. It must pass ``isValidURL``
            and carry an ``Id`` query parameter, which becomes ``meeting_id``.

    Returns:
        The string ``"Invalid URL"`` when ``URL`` fails validation. Otherwise a
        dict with the keys ``meeting_id``, ``agenda_header_subtitle``,
        ``start_time``, ``location``, ``attendance`` and ``roll_call``, plus
        one ``"<item>.<motion>"`` entry per motion found. Each motion entry
        holds ``titles``, ``moved_by``, ``details``, ``results``, ``votes``,
        ``attachment_names``, ``attachment_links``, ``attachment_count`` and,
        when the motion has at least one description, ``summary``.

    Raises:
        KeyError: ``URL`` has no ``Id`` query parameter.
        AttributeError: the page has no ``package-container`` element.
        requests.exceptions.RequestException: the page could not be fetched.
    """
    return _scrape_minutes(URL, summarize=True)


def minutes_scraper_no_sum(URL=""):
    """Scrape a meeting-minutes page without running the summarizer.

    Behaves like ``minutes_scraper`` except that no summarizer is created and
    motion entries never receive a ``summary`` key.

    Args:
        URL: Address of the meeting-minutes page. It must pass ``isValidURL``
            and carry an ``Id`` query parameter, which becomes ``meeting_id``.

    Returns:
        The string ``"Invalid URL"`` when ``URL`` fails validation, otherwise
        the scraped dict (see ``minutes_scraper`` for the key layout).

    Raises:
        KeyError: ``URL`` has no ``Id`` query parameter.
        AttributeError: the page has no ``package-container`` element.
        requests.exceptions.RequestException: the page could not be fetched.
    """
    return _scrape_minutes(URL, summarize=False)


# Seconds to wait on the minutes server. Without a timeout ``requests.get``
# can block forever on an unresponsive host.
_REQUEST_TIMEOUT_SECONDS = 30


def _scrape_minutes(URL, summarize):
    """Shared implementation behind both public scraper entry points."""
    if not isValidURL(URL):
        print("Invalid or missing URL input")
        print("Please enter a URL now:")
        return "Invalid URL"

    # Object to be serialized by the caller. The meeting ID is read first so
    # that a URL without one fails before any network or model work is done.
    JSON_obj = {}
    query = parse_qs(urlparse(URL).query)
    JSON_obj["meeting_id"] = query["Id"][0]

    # Only build the summarizer when summaries were actually requested.
    s = summarizer() if summarize else None

    page = requests.get(URL, timeout=_REQUEST_TIMEOUT_SECONDS)
    soup = BeautifulSoup(page.content, "html.parser")

    # Most of the page content is found in this container.
    page_content = soup.find(id="package-container")

    # Header: subtitle, start time, location and attendance.
    agenda_header = page_content.find("header", class_="AgendaHeader")
    JSON_obj.update(_parse_header(agenda_header))

    # Body: roll call followed by the motions of every agenda item.
    agenda_items = page_content.find("div", class_="AgendaItems")
    JSON_obj["roll_call"] = _parse_roll_call(agenda_items)

    if DEBUG:
        print(JSON_obj["roll_call"])

    # Walk only the element children. ``.children`` also yields whitespace
    # and comment nodes, which have no ``find_all`` and would raise.
    if agenda_items is None:
        item_containers = []
    else:
        item_containers = agenda_items.find_all(True, recursive=False)

    for item_number, agenda_item in enumerate(item_containers, start=1):
        if DEBUG:
            print(item_number)

        motions = agenda_item.find_all("ul", class_="AgendaItemMotions")
        for item_sub_number, motion in enumerate(motions, start=1):
            motion_key = f"{item_number}.{item_sub_number}"

            if DEBUG:
                print(motion_key)

            motion_obj = _parse_motion(motion, s)

            if DEBUG:
                _print_motion(motion_key, motion_obj)

            JSON_obj[motion_key] = motion_obj

        if DEBUG:
            print('-----------------------------------\n\n\n')

    return JSON_obj


def _find(node, *args, **kwargs):
    """Run ``node.find(...)``, passing a missing (``None``) node through."""
    return None if node is None else node.find(*args, **kwargs)


def _text_or_empty(node):
    """Return ``node.text``, or ``""`` when the lookup found nothing."""
    return "" if node is None else node.text


def _list_item_texts(divs, index):
    """Return the ``<li>`` texts under ``divs[index]``, or ``[]`` if absent."""
    if index >= len(divs):
        return []
    return [li.text for li in divs[index].find_all("li")]


def _parse_header(agenda_header):
    """Extract the header fields; any element that is missing yields an empty value."""
    attendance_table = _find(agenda_header, class_="AgendaHeaderAttendanceTable")
    attendance_divs = [] if attendance_table is None else attendance_table.find_all("div")

    return {
        "agenda_header_subtitle": _text_or_empty(
            _find(agenda_header, "p", class_="AgendaHeaderSubTitle")
        ),
        "start_time": _text_or_empty(_find(agenda_header, "time")),
        # NOTE: this does not capture all of the location information.
        "location": _text_or_empty(
            _find(agenda_header, "div", class_="Value LocationValue")
        ),
        # Attendance is split in two lists, read from the 3rd and 6th <div>
        # of the attendance table (per the original author: those who can
        # and those who cannot vote).
        "attendance": {
            "present": _list_item_texts(attendance_divs, 2),
            "also_present": _list_item_texts(attendance_divs, 5),
        },
    }


def _parse_roll_call(agenda_items):
    """Return the roll-call names from the first agenda item, or ``[]``."""
    first_container = _find(agenda_items, "div", class_="AgendaItemContainer indent")
    if first_container is None:
        return []

    paragraphs = first_container.find_all("p", class_="Body1")
    if len(paragraphs) < 3:
        return []

    # The third paragraph is read as a sentence such as "A, B, and C."
    return paragraphs[2].text.rstrip(".").replace(", and ", ", ").split(", ")


def _parse_motion(motion, s=None):
    """Collect every field of one motion list into a dict.

    ``s`` is the summarizer object; when it is ``None`` no ``summary`` key
    is produced.
    """
    text_nodes = motion.find_all("div", class_="MotionText RichText")

    # "Anchor": the fourth ancestor of each motion-text div, which is where
    # the motion's title and attachment links are looked up.
    anchors = [node.parent.parent.parent.parent for node in text_nodes]

    titles = [
        _text_or_empty(anchor.find("div", class_="AgendaItemTitle")).strip()
        for anchor in anchors
    ]
    # A mover without a value becomes "" so the list keeps one entry per
    # "MovedBy" div.
    moved_by = [
        _text_or_empty(div.find("span", class_="Value"))
        for div in motion.find_all("div", class_="MovedBy")
    ]
    details = [node.text for node in text_nodes]
    results = [div.text for div in motion.find_all("div", class_="MotionResult")]

    # Keep what follows the first ")" of each voters table (the whole text
    # when there is no ")") and split it on ", and ".
    votes = []
    for table in motion.find_all("table", class_="MotionVoters"):
        voters_text = table.text
        votes.append(voters_text[voters_text.find(")") + 1:].split(", and "))

    # Only the first anchor's attachments are reported. A motion without any
    # motion text has no anchor, hence no attachments.
    attachments = anchors[0].find_all("a", class_="Link") if anchors else []
    attachment_names = [link.text for link in attachments]
    attachment_links = [link.get("href", "") for link in attachments]

    motion_obj = {
        "titles": titles,
        "moved_by": moved_by,
        "details": details,
        "results": results,
        "votes": votes,
        "attachment_names": attachment_names,
        "attachment_links": attachment_links,
        "attachment_count": len(attachment_names),
    }

    # There is a single "summary" slot and the last description wins, so
    # only that one is summarized instead of summarizing every description
    # and discarding all but the last result.
    if s is not None and details:
        last_description = details[-1]
        if len(last_description.split()) > s.max_length:
            motion_obj["summary"] = s.summarize(text=last_description)[0]
        else:
            motion_obj["summary"] = "Too short to summarize"

    return motion_obj


def _print_motion(motion_key, motion_obj):
    """Dump one parsed motion to stdout (debug tracing only)."""
    print(motion_key)
    print(motion_obj["titles"])  # title
    print("Moved by: " + str(motion_obj["moved_by"]))  # Moved by
    print(motion_obj["details"])  # Other details
    print("Result: " + str(motion_obj["results"]))  # Result
    print("Votes: " + str(motion_obj["votes"]))  # Votes
    print(motion_obj["attachment_names"])  # attachment names
    print(motion_obj["attachment_links"])  # attachment links
    print()