Spaces:

noelfranthomas
/

test_transparencyAPI

Runtime error

App Files Files Community

noelfranthomas committed on Sep 27, 2022

Commit

01ef47d

1 Parent(s): e9ebb6e

NLP

Browse files

Files changed (2) hide show

app.py +20 -5
web_scraper.py +190 -0

app.py CHANGED Viewed

@@ -1,9 +1,24 @@
 import gradio as gr
-from web_scraper import minutes_scraper
-# def greet(name):
-#     return "Hello " + name + "!!"
-iface = gr.Interface(fn=minutes_scraper, inputs="text", outputs="json")
-iface.launch()

 import gradio as gr
+from web_scraper import minutes_scraper, minutes_scraper_no_sum
+# Build a two-tab UI for scraping meeting minutes from a URL. Each tab owns
+# its own, uniquely named input/output/button components; reusing one set of
+# names across tabs would rebind them and leave the first tab's button with
+# no handler attached.
+with gr.Blocks() as app:
+    gr.Markdown("Get the meeting minutes from a URL")
+    with gr.Tab("Without Summary"):
+        plain_input = gr.Textbox()
+        plain_output = gr.JSON()
+        plain_button = gr.Button("Scrape")
+    with gr.Tab("With Summary (Slower)"):
+        summary_input = gr.Textbox()
+        summary_output = gr.JSON()
+        summary_button = gr.Button("Scrape & Summarize")
+        with gr.Accordion("Note on Summary"):
+            gr.Markdown("The summary is generated using the [Facebook BART model](https://huggingface.co/facebook/bart-large-cnn). The summary is not perfect, but it is a good starting point for a quick overview of the meeting. Please bear in mind that this process may take longer depending on the amount of text to summarize.")
+    # Event listeners are registered inside the Blocks context. Pass the
+    # function objects themselves: calling one here would hand Gradio its
+    # return value instead of a callback.
+    # NOTE(review): the tab-to-function mapping follows the tab labels
+    # ("Without Summary" -> minutes_scraper_no_sum); confirm this is intended.
+    plain_button.click(minutes_scraper_no_sum, inputs=plain_input, outputs=plain_output)
+    summary_button.click(minutes_scraper, inputs=summary_input, outputs=summary_output)
+app.launch()

web_scraper.py CHANGED Viewed

@@ -204,6 +204,196 @@ def minutes_scraper(URL=""):
     # # Serialize and write to "meeting_minutes.json"
     # with open(f"{out_dir}/meeting_minutes.json", "w") as out:
     #     json.dump(JSON_obj, out, indent=4)

+    # # Serialize and write to "meeting_minutes.json"
+    # with open(f"{out_dir}/meeting_minutes.json", "w") as out:
+    #     json.dump(JSON_obj, out, indent=4)
+    # Add this to data base
+    return JSON_obj
+def minutes_scraper_no_sum(URL=""):
+    if not isValidURL(URL):
+        print("Invalid or missing URL input")
+        print("Please enter a URL now:")
+        return "Invalid URL"
+    # Get output directory
+    out_dir = ""
+    out_dir = os.getcwd()
+    ###
+    s = summarizer() # Summarizer object
+    # Object to be serialized
+    JSON_obj = {}
+    # Get meeting ID
+    page = requests.get(URL)
+    o = urlparse(URL)
+    query = parse_qs(o.query)
+    JSON_obj["meeting_id"] = query["Id"][0]
+    # Complete HTML File
+    soup = BeautifulSoup(page.content, "html.parser")
+    # Most of the page content is found in this container
+    page_content = soup.find(id="package-container")
+    ###
+    # MM Header
+    agenda_header = page_content.find("header", class_="AgendaHeader")
+    ## Header information
+    # Get the Agenda Header
+    try:
+        JSON_obj["agenda_header_subtitle"] = agenda_header.find("p", class_="AgendaHeaderSubTitle").text
+    except AttributeError:
+        JSON_obj["agenda_header_subtitle"] = ""
+    # Get the start time
+    JSON_obj["start_time"] = agenda_header.find("time").text
+    # Get the location
+    try:
+        JSON_obj["location"] = agenda_header_subtitle = agenda_header.find("div", class_="Value LocationValue").text ### This does not get all location info
+    except AttributeError:
+        JSON_obj["location"] = ""
+    # Get the attendance (separated by who can and can't vote)
+    attendance_table = agenda_header.find(class_="AgendaHeaderAttendanceTable").find_all("div")
+    try:
+        present = [x.text for x in attendance_table[2].find_all("li")]
+    except IndexError:
+        present = []
+    try:
+        also_present = [x.text for x in attendance_table[5].find_all("li")]
+    except IndexError:
+        also_present = []
+    JSON_obj["attendance"] = {'present': present, 'also_present': also_present}
+    ###
+    # MM Body
+    agenda_items = page_content.find("div", class_="AgendaItems")
+    ## Body information
+    # Get item containers
+    agenda_item_containers = agenda_items.find_all("div", class_="AgendaItemContainer indent")
+    # Get roll call
+    try:
+        roll_call = agenda_item_containers[0].find_all("p", class_="Body1")
+        JSON_obj["roll_call"] = roll_call[2].text.rstrip('.').replace(', and ', ', ').split(', ')
+    except IndexError:
+        JSON_obj["roll_call"] = []
+    if DEBUG:
+        print(JSON_obj["roll_call"])
+    # Get generator of item containers
+    agenda_item_containers = agenda_items.children
+    item_number = 1
+    for agenda_item in agenda_item_containers:
+        # Get titles
+        titles = [x.text for x in agenda_item.find_all("div", class_="AgendaItemTitle")]
+        # Get each motion in each item
+        motions = agenda_item.find_all("ul", class_="AgendaItemMotions")
+        if DEBUG:
+            print(item_number)
+        if motions != None:
+            item_sub_number = 1
+            for motion in motions:
+                # Dictionary to store all motion info
+                motion_obj = {}
+                if DEBUG:
+                    print(str(item_number) + '.' + str(item_sub_number))
+                # Place "anchor"
+                motion_anchor = [x.parent.parent.parent.parent for x in motion.find_all("div", class_="MotionText RichText")]
+                # Get motion title
+                motion_titles = [x.find("div", class_="AgendaItemTitle").text.strip() for x in motion_anchor]
+                # Get list of who the motion is moved by
+                moved_by_list = [x.find("span", class_="Value") for x in motion.find_all("div", class_="MovedBy")]
+                moved_by_list  = [x.text for x in moved_by_list]
+                # Get motion description
+                motion_description_list = [x.text for x in motion.find_all("div", class_="MotionText RichText")]
+                # Get motion result
+                motion_result_list = [x.text for x in motion.find_all("div", class_="MotionResult")]
+                # Get motion votes
+                motion_votes_list = [x.text[x.text.find(')') + 1:].split(', and ') for x in motion.find_all("table", class_="MotionVoters")]
+                # Get motion attachments
+                motion_attachments_list = [x.find_all("a", class_="Link") for x in motion_anchor]
+                motion_attachments_list_names = []
+                motion_attachments_list_links = []
+                for x in motion_attachments_list:
+                    motion_attachments_list_names.append([y.text for y in x]) # ?
+                    motion_attachments_list_links.append([y['href'] for y in x])
+                motion_obj["titles"] = motion_titles
+                motion_obj["moved_by"] = moved_by_list
+                motion_obj["details"] = motion_description_list
+                motion_obj["results"] = motion_result_list
+                motion_obj["votes"] = motion_votes_list
+                motion_obj['attachment_names'] = motion_attachments_list_names[0]
+                motion_obj['attachment_links'] = motion_attachments_list_links[0]
+                motion_obj['attachment_count'] = len(motion_attachments_list_names[0])
+                # for desc in motion_description_list:
+                #     if len(desc.split()) > s.max_length:
+                #         motion_obj['summary'] = s.summarize(text=desc)[0]
+                #     else:
+                #         motion_obj['summary'] = "Too short to summarize"
+                if DEBUG:
+                    print(str(item_number) + '.' + str(item_sub_number))
+                    print(motion_titles) # title
+                    print("Moved by: " + str(moved_by_list)) # Moved by
+                    print(motion_description_list) # Other details
+                    print("Result: " + str(motion_result_list)) # Result
+                    print("Votes: " + str(motion_votes_list)) # Votes
+                    print(motion_attachments_list_names[0]) # attachment names
+                    print(motion_attachments_list_links[0]) # attachment links
+                    print()
+                # Append to JSON object
+                JSON_obj[f'{item_number}.{item_sub_number}'] = motion_obj
+                item_sub_number+=1
+        if DEBUG:
+            print('-----------------------------------\n\n\n')
+        item_number+=1
     # # Serialize and write to "meeting_minutes.json"
     # with open(f"{out_dir}/meeting_minutes.json", "w") as out:
     #     json.dump(JSON_obj, out, indent=4)