Commit · e38196e
Parent(s): d804491
Update app.py

app.py CHANGED
@@ -3,11 +3,11 @@
[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
'''

-import os,re, requests, uuid, zipfile, hashlib, shutil
+import os, re, requests, uuid, zipfile, hashlib, shutil
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer
import torch

# Function to validate URLs

@@ -23,34 +23,29 @@ def finder(url, soup, media_type):
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())
-    # Find links
-    else:
-        for link in soup.find_all('a'):
-            file = link.get('href')
-            if file and media_type in file:
-                file_url = file
-                if not validator(file_url): # Assuming 'validator' is a function defined elsewhere
-                    file_url = urljoin(url, file_url)
-                files.append(file_url)
    return files

-def summarize_long_text(text,
+def summarize_long_text(text, model_name="facebook/bart-large-cnn", max_chunk_tokens=500):
    # Initialize the summarization pipeline
-    summarizer = pipeline('summarization')
+    summarizer = pipeline('summarization', model=model_name)

-    #
-
+    # Initialize the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_name)

-    #
-
+    # Tokenize the text
+    tokens = tokenizer.encode(text)

-    #
-
+    # Split the tokens into chunks of the specified size
+    chunks = [tokens[i:i + max_chunk_tokens] for i in range(0, len(tokens), max_chunk_tokens)]

-    #
-    final_summary = '
+    # Summarize each chunk and combine the results
+    final_summary = ''
+    for chunk in chunks:
+        chunk_text = tokenizer.decode(chunk)
+        summary = summarizer(chunk_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
+        final_summary += ' ' + summary

-    return final_summary
+    return final_summary.strip()

def scrapper(url):
    try:

@@ -59,20 +54,17 @@ def scrapper(url):
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
        return None
-
    soup = BeautifulSoup(response.content, 'html.parser')
-
    # Add text files to the text folder
    text_content = finder(url, soup, 'text')
    os.makedirs('text', exist_ok=True)
-    full_text = ''
-    if text_content:
-        with open('text/content.txt', 'w') as text_file:
-            for line in text_content:
-                text_file.write(line + '\n')
-                full_text += line + ' '
+    full_text = ' '.join(text_content) # Join the text content into a single string

-    #
+    # Save the full text to a file
+    with open('text/content.txt', 'w') as text_file:
+        text_file.write(full_text)
+
+    # Summarize the text
    summary = summarize_long_text(full_text)

    return summary
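
The substance of this commit: summarize_long_text now encodes the scraped text with the model's tokenizer, splits the token ids into 500-token chunks, summarizes each chunk, and stitches the pieces into the final summary; scrapper, in turn, joins the finder output into one string before writing content.txt and summarizing it. As a rough, self-contained sketch (not part of the commit), the tokenize-and-chunk step can be tried on its own as below; the synthetic sample text is an assumption, the checkpoint name and chunk size come from the diff, and only the tokenizer is downloaded, not the summarization model.

# Illustrative sketch: mirrors the tokenize-and-chunk step this commit adds,
# without invoking the summarization pipeline itself.
from transformers import AutoTokenizer

model_name = "facebook/bart-large-cnn"   # same checkpoint app.py now selects
max_chunk_tokens = 500                   # same chunk size as in app.py

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Synthetic stand-in for the scraped page text (assumption for illustration).
text = ("Web scraping gathers the visible text of a page so it can be "
        "stored and summarized later. ") * 150

# Encode to token ids, then slice into fixed-size chunks, as
# summarize_long_text does before calling the summarizer on each piece.
tokens = tokenizer.encode(text)
chunks = [tokens[i:i + max_chunk_tokens] for i in range(0, len(tokens), max_chunk_tokens)]

print(f"{len(tokens)} tokens split into {len(chunks)} chunks")
print(tokenizer.decode(chunks[0])[:200])  # preview of the first chunk's text

Chunks of 500 tokens leave comfortable headroom under the roughly 1,024-token input limit of facebook/bart-large-cnn, so each call to the summarizer receives a slice of the page that fits the model.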