Spaces:

noelfranthomas
/

test_transparencyAPI

Runtime error

noelfranthomas committed on Sep 27, 2022

Commit

e9ebb6e

•

1 Parent(s): ae8b33b

NLP

Files changed (4) hide show

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import gradio as gr
-from scraper import minutes_scraper
 # def greet(name):
 #     return "Hello " + name + "!!"

 import gradio as gr
+from web_scraper import minutes_scraper
 # def greet(name):
 #     return "Hello " + name + "!!"

nlp.py ADDED Viewed

+from transformers import pipeline
class summarizer:
    """Summarize text with the ``facebook/bart-large-cnn`` model.

    The underlying Hugging Face pipeline is expensive to construct (it loads
    the model weights), so it is built once, on the first call to
    :meth:`summarize`, and reused for every later call on this instance.

    Attributes are plain and may be changed after construction; the current
    values are applied on each call.
    """

    def __init__(self, max_length=30, min_length=5):
        """Store the generation length bounds.

        Args:
            max_length: Upper bound passed to the pipeline as ``max_length``.
            min_length: Lower bound passed to the pipeline as ``min_length``.
        """
        self.max_length = max_length
        self.min_length = min_length
        # Built lazily so that creating a summarizer stays cheap and the
        # model is only loaded if a summary is actually requested.
        self._pipeline = None

    def summarize(self, text):
        """Return the summarization pipeline's output for *text*.

        Args:
            text: The text to summarize.

        Returns:
            Whatever the pipeline returns for *text*, unchanged.
        """
        if self._pipeline is None:
            self._pipeline = pipeline(
                task="summarization",
                model="facebook/bart-large-cnn",
            )
        # Generation settings are supplied per call (not baked into the
        # cached pipeline) so later changes to max_length/min_length on the
        # instance still take effect, as they did when the pipeline was
        # rebuilt on every call.
        return self._pipeline(
            text,
            max_length=self.max_length,
            min_length=self.min_length,
            num_beams=4,
        )

requirements.txt CHANGED Viewed

@@ -1,8 +1,20 @@
 beautifulsoup4==4.11.1
 certifi==2022.6.15
 charset-normalizer==2.1.1
 idna==3.3
 protobuf==3.19.4
 requests==2.28.1
 soupsieve==2.3.2.post1
 urllib3==1.26.12

 beautifulsoup4==4.11.1
 certifi==2022.6.15
 charset-normalizer==2.1.1
+filelock==3.8.0
+huggingface-hub==0.9.1
 idna==3.3
+numpy==1.23.3
+packaging==21.3
 protobuf==3.19.4
+pyparsing==3.0.9
+PyYAML==6.0
+regex==2022.9.13
 requests==2.28.1
 soupsieve==2.3.2.post1
+tokenizers==0.12.1
+torch==1.12.1
+tqdm==4.64.1
+transformers==4.22.1
+typing_extensions==4.3.0
 urllib3==1.26.12

scraper.py → web_scraper.py RENAMED Viewed

@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
 from urllib.parse import urlparse, parse_qs
 from utils import isValidURL
 import json
 import sys
@@ -18,6 +19,9 @@ import os
 # Debug mode
 DEBUG = False
 def minutes_scraper(URL=""):
     if not isValidURL(URL):
         print("Invalid or missing URL input")
@@ -31,6 +35,8 @@ def minutes_scraper(URL=""):
     ###
     # Object to be serialized
     JSON_obj = {}
@@ -164,6 +170,13 @@ def minutes_scraper(URL=""):
                 motion_obj["votes"] = motion_votes_list
                 motion_obj['attachment_names'] = motion_attachments_list_names[0]
                 motion_obj['attachment_links'] = motion_attachments_list_links[0]
                 if DEBUG:
@@ -188,6 +201,9 @@ def minutes_scraper(URL=""):
         item_number+=1
     # # Serialize and write to "meeting_minutes.json"
     # with open(f"{out_dir}/meeting_minutes.json", "w") as out:
     #     json.dump(JSON_obj, out, indent=4)

 from urllib.parse import urlparse, parse_qs
 from utils import isValidURL
+from nlp import summarizer
 import json
 import sys
 # Debug mode
 DEBUG = False
+if __name__ == "__main__":
+    DEBUG = True
 def minutes_scraper(URL=""):
     if not isValidURL(URL):
         print("Invalid or missing URL input")
     ###
+    s = summarizer() # Summarizer object
     # Object to be serialized
     JSON_obj = {}
                 motion_obj["votes"] = motion_votes_list
                 motion_obj['attachment_names'] = motion_attachments_list_names[0]
                 motion_obj['attachment_links'] = motion_attachments_list_links[0]
+                motion_obj['attachment_count'] = len(motion_attachments_list_names[0])
+                for desc in motion_description_list:
+                    if len(desc.split()) > s.max_length:
+                        motion_obj['summary'] = s.summarize(text=desc)[0]
+                    else:
+                        motion_obj['summary'] = "Too short to summarize"
                 if DEBUG:
         item_number+=1
     # # Serialize and write to "meeting_minutes.json"
     # with open(f"{out_dir}/meeting_minutes.json", "w") as out:
     #     json.dump(JSON_obj, out, indent=4)