noelfranthomas committed on
Commit
05ab1ec
1 Parent(s): 8449e2f
Files changed (2) hide show
  1. app.py +5 -3
  2. scraper.py +197 -0
app.py CHANGED
@@ -1,7 +1,9 @@
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 
 
 
7
  iface.launch()
 
1
  import gradio as gr
2
 
3
+ from scripts import minutes_scraper
 
4
 
5
+ # def greet(name):
6
+ # return "Hello " + name + "!!"
7
+
8
# Expose the scraper through a minimal web UI: one text box in (the meeting
# URL), one JSON viewer out (the scraped document).
# Gradio looks up string shortcuts by the lowercase component name, so the
# JSON output component must be requested as "json" rather than "JSON".
# NOTE(review): minutes_scraper is imported from `scripts` above, while this
# commit adds the function in scraper.py -- confirm the import path resolves.
iface = gr.Interface(fn=minutes_scraper, inputs="text", outputs="json")
iface.launch()
scraper.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from urllib.parse import urlparse, parse_qs
4
+
5
+ from utils import isValidURL
6
+
7
+ import json
8
+ import sys
9
+ import os
10
+
11
+ # REQUIREMENTS: https://pixeltree.notion.site/City-Council-Scraping-34a2f5a24d59400faf9a128f2653ebf2
12
+ # Meeting Minutes Directory: https://pub-calgary.escribemeetings.com
13
+
14
+ # INPUT (arg 1): Valid URL pointing to meeting minutes. Needs to be wrapped in quotes
15
+ # OPTIONAL INPUT (arg 2): Output directory
16
+ # OUTPUT: JSON document containing required information scraped from input URL
17
+
18
+ # Debug mode
19
+ DEBUG = False
20
+
21
def minutes_scraper(URL=""):
    """Scrape a meeting-minutes page into a JSON-serializable dictionary.

    The page is fetched with ``requests`` and parsed with BeautifulSoup.
    Extraction is best-effort: any element that is missing from the page
    yields an empty string or empty list instead of aborting the scrape.

    Args:
        URL: Address of the meeting-minutes page. The meeting id is taken
            from its ``Id`` query parameter.

    Returns:
        The string ``"Invalid URL"`` when ``isValidURL(URL)`` is false.
        Otherwise a dict with the keys ``meeting_id``,
        ``agenda_header_subtitle``, ``start_time``, ``location``,
        ``attendance`` (``present`` / ``also_present`` lists) and
        ``roll_call``, followed by one ``"<item>.<motion>"`` entry per
        motions list found under each top-level agenda item.

    Raises:
        Any exception raised by ``requests.get`` (connection errors,
        timeouts) propagates to the caller.
    """
    if not isValidURL(URL):
        # No input is read here, so only report the problem; the previous
        # "Please enter a URL now:" prompt promised an interaction that
        # never happened.
        print("Invalid or missing URL input")
        return "Invalid URL"

    # Object to be serialized
    JSON_obj = {}

    # Without a timeout a stalled server would block this call forever.
    page = requests.get(URL, timeout=30)

    # Get meeting ID (empty string when the URL carries no "Id" parameter)
    query = parse_qs(urlparse(URL).query)
    JSON_obj["meeting_id"] = query.get("Id", [""])[0]

    # Complete HTML file
    soup = BeautifulSoup(page.content, "html.parser")

    # Most of the page content is found in this container
    page_content = soup.find(id="package-container")

    # --- Header information -------------------------------------------
    agenda_header = _find(page_content, "header", class_="AgendaHeader")

    JSON_obj["agenda_header_subtitle"] = _text(
        _find(agenda_header, "p", class_="AgendaHeaderSubTitle")
    )
    JSON_obj["start_time"] = _text(_find(agenda_header, "time"))
    # NOTE: this does not capture all of the location information.
    JSON_obj["location"] = _text(
        _find(agenda_header, "div", class_="Value LocationValue")
    )

    # Attendance, separated by who can and who cannot vote. The two lists
    # are read from the 3rd and 6th <div> of the attendance table.
    attendance_divs = _find_all(
        _find(agenda_header, class_="AgendaHeaderAttendanceTable"), "div"
    )
    JSON_obj["attendance"] = {
        'present': _list_item_texts(attendance_divs, 2),
        'also_present': _list_item_texts(attendance_divs, 5),
    }

    # --- Body information ---------------------------------------------
    agenda_items = _find(page_content, "div", class_="AgendaItems")

    # Roll call: third "Body1" paragraph of the first indented container,
    # split into individual names.
    indented_containers = _find_all(
        agenda_items, "div", class_="AgendaItemContainer indent"
    )
    try:
        roll_call = indented_containers[0].find_all("p", class_="Body1")
        JSON_obj["roll_call"] = (
            roll_call[2].text.rstrip('.').replace(', and ', ', ').split(', ')
        )
    except IndexError:
        JSON_obj["roll_call"] = []

    if DEBUG:
        print(JSON_obj["roll_call"])

    # Walk the direct child *tags* only. Iterating ``.children`` would also
    # yield the text nodes between containers, and strings cannot be
    # searched the way tags can.
    top_level_items = _find_all(agenda_items, True, recursive=False)

    for item_number, agenda_item in enumerate(top_level_items, start=1):
        if DEBUG:
            print(item_number)

        # Each motions list inside the item becomes "<item>.<n>"
        motions = agenda_item.find_all("ul", class_="AgendaItemMotions")
        for item_sub_number, motion in enumerate(motions, start=1):
            motion_obj = _parse_motion(motion)

            if DEBUG:
                print(str(item_number) + '.' + str(item_sub_number))
                print(motion_obj["titles"])  # title
                print("Moved by: " + str(motion_obj["moved_by"]))  # Moved by
                print(motion_obj["details"])  # Other details
                print("Result: " + str(motion_obj["results"]))  # Result
                print("Votes: " + str(motion_obj["votes"]))  # Votes
                print(motion_obj['attachment_names'])  # attachment names
                print(motion_obj['attachment_links'])  # attachment links
                print()

            # Append to JSON object
            JSON_obj[f'{item_number}.{item_sub_number}'] = motion_obj

        if DEBUG:
            print('-----------------------------------\n\n\n')

    # TODO: persist the result (JSON file / database); not implemented yet.

    return JSON_obj


def _find(node, *args, **kwargs):
    """Return ``node.find(...)``, or None when ``node`` itself is None."""
    return None if node is None else node.find(*args, **kwargs)


def _find_all(node, *args, **kwargs):
    """Return ``node.find_all(...)``, or an empty list when ``node`` is None."""
    return [] if node is None else node.find_all(*args, **kwargs)


def _text(node):
    """Return the text of ``node``, or an empty string when it is None."""
    return "" if node is None else node.text


def _ancestor(node, levels):
    """Return the ancestor ``levels`` steps above ``node`` (None if absent)."""
    for _ in range(levels):
        if node is None:
            return None
        node = node.parent
    return node


def _list_item_texts(divs, index):
    """Return the <li> texts inside ``divs[index]``, or [] if out of range."""
    if index >= len(divs):
        return []
    return [li.text for li in divs[index].find_all("li")]


def _parse_motion(motion):
    """Collect titles, movers, details, results, votes and attachments of one motions list."""
    motion_texts = motion.find_all("div", class_="MotionText RichText")

    # "Anchor": the element four levels above each motion text; the title
    # and the attachment links are looked up from there.
    anchors = [_ancestor(x, 4) for x in motion_texts]

    titles = [
        _text(_find(anchor, "div", class_="AgendaItemTitle")).strip()
        for anchor in anchors
    ]

    # Keep one entry per "MovedBy" block so positions stay aligned even
    # when a block has no value span.
    moved_by = [
        _text(x.find("span", class_="Value"))
        for x in motion.find_all("div", class_="MovedBy")
    ]

    details = [x.text for x in motion_texts]
    results = [x.text for x in motion.find_all("div", class_="MotionResult")]

    # Voter text: drop everything up to the first ')' and split the rest
    # on ', and '.
    votes = []
    for table in motion.find_all("table", class_="MotionVoters"):
        table_text = table.text
        votes.append(table_text[table_text.find(')') + 1:].split(', and '))

    # Attachments are taken from the first anchor only; a motions list
    # without any motion text simply has no attachments.
    links = _find_all(anchors[0], "a", class_="Link") if anchors else []

    return {
        "titles": titles,
        "moved_by": moved_by,
        "details": details,
        "results": results,
        "votes": votes,
        'attachment_names': [link.text for link in links],
        'attachment_links': [link.get('href', '') for link in links],
    }