Commit e9ebb6e
1 Parent(s): ae8b33b
NLP
- app.py +1 -1
- nlp.py +13 -0
- requirements.txt +12 -0
- scraper.py → web_scraper.py +16 -0
    	
app.py CHANGED

@@ -1,6 +1,6 @@
 import gradio as gr
 
-from 
+from web_scraper import minutes_scraper
 
 # def greet(name):
 #     return "Hello " + name + "!!"
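The app.py hunk above only swaps in the import of minutes_scraper; the rest of the Gradio app is not shown in this diff. As a rough, hypothetical sketch of how the imported function could be exposed (the interface definition below is an assumption, not the actual contents of app.py):

    # Hypothetical wiring of the scraper into the Gradio UI; input/output types are assumptions.
    import gradio as gr
    from web_scraper import minutes_scraper

    demo = gr.Interface(fn=minutes_scraper, inputs="text", outputs="json")
    demo.launch()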
    	
nlp.py ADDED

@@ -0,0 +1,13 @@
+from transformers import pipeline
+
+class summarizer:
+
+    def __init__(self, max_length=30, min_length=5):
+        self.max_length = max_length
+        self.min_length = min_length
+
+    def summarize(self, text):
+        classifier = pipeline(task="summarization", model="facebook/bart-large-cnn", max_length=self.max_length, min_length=self.min_length, num_beams=4)
+        summary = classifier(text)
+
+        return summary
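A minimal usage sketch of the new summarizer class (not part of the commit; the sample text is a placeholder). Note that transformers' summarization pipeline returns a list of dicts, so summarize() yields something like [{'summary_text': '...'}]:

    # Hypothetical usage of nlp.summarizer, assuming the dependencies added to requirements.txt are installed.
    from nlp import summarizer

    s = summarizer(max_length=30, min_length=5)   # same defaults the class defines
    text = "Placeholder meeting-minutes text to be summarized ..."
    result = s.summarize(text)                    # e.g. [{'summary_text': '...'}]
    print(result[0]["summary_text"])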
    	
requirements.txt CHANGED

@@ -1,8 +1,20 @@
 beautifulsoup4==4.11.1
 certifi==2022.6.15
 charset-normalizer==2.1.1
+filelock==3.8.0
+huggingface-hub==0.9.1
 idna==3.3
+numpy==1.23.3
+packaging==21.3
 protobuf==3.19.4
+pyparsing==3.0.9
+PyYAML==6.0
+regex==2022.9.13
 requests==2.28.1
 soupsieve==2.3.2.post1
+tokenizers==0.12.1
+torch==1.12.1
+tqdm==4.64.1
+transformers==4.22.1
+typing_extensions==4.3.0
 urllib3==1.26.12
    	
scraper.py → web_scraper.py RENAMED

@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
 from urllib.parse import urlparse, parse_qs
 
 from utils import isValidURL
+from nlp import summarizer
 
 import json
 import sys
@@ -18,6 +19,9 @@ import os
 # Debug mode
 DEBUG = False
 
+if __name__ == "__main__":
+    DEBUG = True
+
 def minutes_scraper(URL=""):
     if not isValidURL(URL):
         print("Invalid or missing URL input")
@@ -31,6 +35,8 @@ def minutes_scraper(URL=""):
 
     ###
 
+    s = summarizer() # Summarizer object
+
     # Object to be seriliazed
     JSON_obj = {}
 
@@ -164,6 +170,13 @@ def minutes_scraper(URL=""):
             motion_obj["votes"] = motion_votes_list
             motion_obj['attachment_names'] = motion_attachments_list_names[0]
             motion_obj['attachment_links'] = motion_attachments_list_links[0]
+            motion_obj['attachment_count'] = len(motion_attachments_list_names[0])
+
+            for desc in motion_description_list:
+                if len(desc.split()) > s.max_length:
+                    motion_obj['summary'] = s.summarize(text=desc)[0]
+                else:
+                    motion_obj['summary'] = "Too short to summarize"
 
 
             if DEBUG:
@@ -188,6 +201,9 @@ def minutes_scraper(URL=""):
         item_number+=1
 
 
+
+
+
     # # Serialize and write to "meeting_minutes.json"
     # with open(f"{out_dir}/meeting_minutes.json", "w") as out:
     #     json.dump(JSON_obj, out, indent=4)
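A short note on the new __main__ guard in web_scraper.py, with a minimal illustration (a sketch; it assumes the module's other imports such as utils and transformers resolve): DEBUG stays False when the module is imported, and is only flipped to True when the file is executed directly.

    # Illustration of the new DEBUG toggle (hypothetical caller script).
    import web_scraper
    print(web_scraper.DEBUG)   # False when imported; True only under "python web_scraper.py"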