Spaces:
Runtime error
Runtime error
noelfranthomas
committed on
Commit
•
e9ebb6e
1
Parent(s):
ae8b33b
NLP
Browse files- app.py +1 -1
- nlp.py +13 -0
- requirements.txt +12 -0
- scraper.py → web_scraper.py +16 -0
app.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
|
3 |
-
from
|
4 |
|
5 |
# def greet(name):
|
6 |
# return "Hello " + name + "!!"
|
|
|
1 |
import gradio as gr
|
2 |
|
3 |
+
from web_scraper import minutes_scraper
|
4 |
|
5 |
# def greet(name):
|
6 |
# return "Hello " + name + "!!"
|
nlp.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import pipeline
|
2 |
+
|
3 |
+
class summarizer:
    """Summarize text with the ``facebook/bart-large-cnn`` summarization pipeline.

    The pipeline is created once per instance, on the first call to
    :meth:`summarize`, and reused afterwards. Building it inside every call
    would construct the pipeline again for each piece of text.
    """

    def __init__(self, max_length=30, min_length=5):
        """Store the length bounds used for generation.

        Args:
            max_length: ``max_length`` value forwarded to the pipeline.
            min_length: ``min_length`` value forwarded to the pipeline.
        """
        self.max_length = max_length
        self.min_length = min_length
        # Created lazily so that constructing a summarizer stays cheap.
        self._pipeline = None

    def summarize(self, text):
        """Run the summarization pipeline on *text* and return its raw output.

        Args:
            text: The text to summarize.

        Returns:
            Whatever the underlying pipeline returns for *text*, unmodified.
        """
        if self._pipeline is None:
            self._pipeline = pipeline(
                task="summarization",
                model="facebook/bart-large-cnn",
            )

        # The bounds are passed per call (not baked into the cached pipeline)
        # so the current attribute values are always the ones applied.
        return self._pipeline(
            text,
            max_length=self.max_length,
            min_length=self.min_length,
            num_beams=4,
        )
|
requirements.txt
CHANGED
@@ -1,8 +1,20 @@
|
|
1 |
beautifulsoup4==4.11.1
|
2 |
certifi==2022.6.15
|
3 |
charset-normalizer==2.1.1
|
|
|
|
|
4 |
idna==3.3
|
|
|
|
|
5 |
protobuf==3.19.4
|
|
|
|
|
|
|
6 |
requests==2.28.1
|
7 |
soupsieve==2.3.2.post1
|
|
|
|
|
|
|
|
|
|
|
8 |
urllib3==1.26.12
|
|
|
1 |
beautifulsoup4==4.11.1
|
2 |
certifi==2022.6.15
|
3 |
charset-normalizer==2.1.1
|
4 |
+
filelock==3.8.0
|
5 |
+
huggingface-hub==0.9.1
|
6 |
idna==3.3
|
7 |
+
numpy==1.23.3
|
8 |
+
packaging==21.3
|
9 |
protobuf==3.19.4
|
10 |
+
pyparsing==3.0.9
|
11 |
+
PyYAML==6.0
|
12 |
+
regex==2022.9.13
|
13 |
requests==2.28.1
|
14 |
soupsieve==2.3.2.post1
|
15 |
+
tokenizers==0.12.1
|
16 |
+
torch==1.12.1
|
17 |
+
tqdm==4.64.1
|
18 |
+
transformers==4.22.1
|
19 |
+
typing_extensions==4.3.0
|
20 |
urllib3==1.26.12
|
scraper.py → web_scraper.py
RENAMED
@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
|
|
3 |
from urllib.parse import urlparse, parse_qs
|
4 |
|
5 |
from utils import isValidURL
|
|
|
6 |
|
7 |
import json
|
8 |
import sys
|
@@ -18,6 +19,9 @@ import os
|
|
18 |
# Debug mode
|
19 |
DEBUG = False
|
20 |
|
|
|
|
|
|
|
21 |
def minutes_scraper(URL=""):
|
22 |
if not isValidURL(URL):
|
23 |
print("Invalid or missing URL input")
|
@@ -31,6 +35,8 @@ def minutes_scraper(URL=""):
|
|
31 |
|
32 |
###
|
33 |
|
|
|
|
|
34 |
# Object to be serialized
|
35 |
JSON_obj = {}
|
36 |
|
@@ -164,6 +170,13 @@ def minutes_scraper(URL=""):
|
|
164 |
motion_obj["votes"] = motion_votes_list
|
165 |
motion_obj['attachment_names'] = motion_attachments_list_names[0]
|
166 |
motion_obj['attachment_links'] = motion_attachments_list_links[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
|
168 |
|
169 |
if DEBUG:
|
@@ -188,6 +201,9 @@ def minutes_scraper(URL=""):
|
|
188 |
item_number+=1
|
189 |
|
190 |
|
|
|
|
|
|
|
191 |
# # Serialize and write to "meeting_minutes.json"
|
192 |
# with open(f"{out_dir}/meeting_minutes.json", "w") as out:
|
193 |
# json.dump(JSON_obj, out, indent=4)
|
|
|
3 |
from urllib.parse import urlparse, parse_qs
|
4 |
|
5 |
from utils import isValidURL
|
6 |
+
from nlp import summarizer
|
7 |
|
8 |
import json
|
9 |
import sys
|
|
|
19 |
# Debug mode
|
20 |
DEBUG = False
|
21 |
|
22 |
+
if __name__ == "__main__":
|
23 |
+
DEBUG = True
|
24 |
+
|
25 |
def minutes_scraper(URL=""):
|
26 |
if not isValidURL(URL):
|
27 |
print("Invalid or missing URL input")
|
|
|
35 |
|
36 |
###
|
37 |
|
38 |
+
s = summarizer() # Summarizer object
|
39 |
+
|
40 |
# Object to be serialized
|
41 |
JSON_obj = {}
|
42 |
|
|
|
170 |
motion_obj["votes"] = motion_votes_list
|
171 |
motion_obj['attachment_names'] = motion_attachments_list_names[0]
|
172 |
motion_obj['attachment_links'] = motion_attachments_list_links[0]
|
173 |
+
motion_obj['attachment_count'] = len(motion_attachments_list_names[0])
|
174 |
+
|
175 |
+
for desc in motion_description_list:
|
176 |
+
if len(desc.split()) > s.max_length:
|
177 |
+
motion_obj['summary'] = s.summarize(text=desc)[0]
|
178 |
+
else:
|
179 |
+
motion_obj['summary'] = "Too short to summarize"
|
180 |
|
181 |
|
182 |
if DEBUG:
|
|
|
201 |
item_number+=1
|
202 |
|
203 |
|
204 |
+
|
205 |
+
|
206 |
+
|
207 |
# # Serialize and write to "meeting_minutes.json"
|
208 |
# with open(f"{out_dir}/meeting_minutes.json", "w") as out:
|
209 |
# json.dump(JSON_obj, out, indent=4)
|