noelfranthomas committed
Commit
e9ebb6e
1 Parent(s): ae8b33b
Files changed (4)
  1. app.py +1 -1
  2. nlp.py +13 -0
  3. requirements.txt +12 -0
  4. scraper.py → web_scraper.py +16 -0
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 
-from scraper import minutes_scraper
+from web_scraper import minutes_scraper
 
 # def greet(name):
 #     return "Hello " + name + "!!"
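The rest of app.py is outside this diff, so the Gradio wiring is not shown. A minimal sketch of how the renamed import might be hooked up, assuming minutes_scraper takes a URL string and returns something JSON-serializable (the interface below is illustrative, not part of the commit):

import gradio as gr

from web_scraper import minutes_scraper

# Hypothetical wiring; the commit itself only renames the import.
demo = gr.Interface(
    fn=minutes_scraper,                      # assumed signature: minutes_scraper(URL="")
    inputs=gr.Textbox(label="Minutes URL"),
    outputs="json",                          # assumed JSON-serializable return value
)

if __name__ == "__main__":
    demo.launch()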
nlp.py ADDED
@@ -0,0 +1,13 @@
+from transformers import pipeline
+
+class summarizer:
+
+    def __init__(self, max_length=30, min_length=5):
+        self.max_length = max_length
+        self.min_length = min_length
+
+    def summarize(self, text):
+        classifier = pipeline(task="summarization", model="facebook/bart-large-cnn", max_length=self.max_length, min_length=self.min_length, num_beams=4)
+        summary = classifier(text)
+
+        return summary
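A quick usage sketch for the new class. A transformers summarization pipeline returns a list of dicts with a "summary_text" key, so callers need to unwrap the result (the sample text below is illustrative):

from nlp import summarizer

s = summarizer(max_length=30, min_length=5)

text = (
    "Council discussed the proposed rezoning of the riverfront parcel, "
    "heard objections from three residents, and deferred the final vote "
    "to the next regular meeting pending a traffic study."
)

result = s.summarize(text)        # e.g. [{"summary_text": "..."}]
print(result[0]["summary_text"])

Note that the pipeline is constructed inside summarize(), so facebook/bart-large-cnn is reloaded on every call; building it once in __init__ and reusing it would avoid that cost.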
requirements.txt CHANGED
@@ -1,8 +1,20 @@
 beautifulsoup4==4.11.1
 certifi==2022.6.15
 charset-normalizer==2.1.1
+filelock==3.8.0
+huggingface-hub==0.9.1
 idna==3.3
+numpy==1.23.3
+packaging==21.3
 protobuf==3.19.4
+pyparsing==3.0.9
+PyYAML==6.0
+regex==2022.9.13
 requests==2.28.1
 soupsieve==2.3.2.post1
+tokenizers==0.12.1
+torch==1.12.1
+tqdm==4.64.1
+transformers==4.22.1
+typing_extensions==4.3.0
 urllib3==1.26.12
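The twelve new pins are the Hugging Face stack pulled in by nlp.py (transformers, tokenizers, huggingface-hub) plus transitive dependencies such as torch, numpy, and regex. A minimal sanity check, assuming the pinned versions installed cleanly:

import torch
import transformers

# Verify the summarization dependencies resolve to the pinned releases.
assert transformers.__version__ == "4.22.1"
assert torch.__version__.startswith("1.12.1")  # local builds may carry a +cu suffix
print("summarization dependencies OK")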
scraper.py → web_scraper.py RENAMED
@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
 from urllib.parse import urlparse, parse_qs
 
 from utils import isValidURL
+from nlp import summarizer
 
 import json
 import sys
@@ -18,6 +19,9 @@ import os
 # Debug mode
 DEBUG = False
 
+if __name__ == "__main__":
+    DEBUG = True
+
 def minutes_scraper(URL=""):
     if not isValidURL(URL):
         print("Invalid or missing URL input")
@@ -31,6 +35,8 @@ def minutes_scraper(URL=""):
 
     ###
 
+    s = summarizer() # Summarizer object
+
     # Object to be seriliazed
     JSON_obj = {}
 
@@ -164,6 +170,13 @@ def minutes_scraper(URL=""):
     motion_obj["votes"] = motion_votes_list
     motion_obj['attachment_names'] = motion_attachments_list_names[0]
     motion_obj['attachment_links'] = motion_attachments_list_links[0]
+    motion_obj['attachment_count'] = len(motion_attachments_list_names[0])
+
+    for desc in motion_description_list:
+        if len(desc.split()) > s.max_length:
+            motion_obj['summary'] = s.summarize(text=desc)[0]
+        else:
+            motion_obj['summary'] = "Too short to summarize"
 
 
     if DEBUG:
@@ -188,6 +201,9 @@ def minutes_scraper(URL=""):
     item_number+=1
 
 
+
+
+
     # # Serialize and write to "meeting_minutes.json"
     # with open(f"{out_dir}/meeting_minutes.json", "w") as out:
     #     json.dump(JSON_obj, out, indent=4)
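Two things worth flagging in the new summarization hunk: the loop writes each description's result to the same motion_obj['summary'] key, so only the last description's summary survives, and s.summarize(text=desc)[0] is a dict of the form {"summary_text": ...} rather than a plain string. A hypothetical variant that keeps one entry per description (not part of the commit):

summaries = []
for desc in motion_description_list:
    if len(desc.split()) > s.max_length:
        # pipeline output is a list of dicts; unwrap the text itself
        summaries.append(s.summarize(text=desc)[0]["summary_text"])
    else:
        summaries.append("Too short to summarize")
motion_obj['summaries'] = summaries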