'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
'''
import os, re, requests, uuid, zipfile, hashlib, shutil
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from transformers import pipeline
import torch
# Function to validate URLs
def validator(url):
parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme)
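# Collect content from a parsed page: with media_type "text" this gathers the visible
# text of common tags; for any other media_type it returns links whose href contains
# that string (e.g. ".pdf"), resolving relative links against the page URL.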
def finder(url, soup, media_type):
files = []
# Find text
if media_type == "text":
text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
for tag in text_tags:
for element in soup.find_all(tag):
files.append(element.get_text())
# Find links
else:
for link in soup.find_all('a'):
file = link.get('href')
if file and media_type in file:
file_url = file
                if not validator(file_url):  # resolve relative links against the page URL
                    file_url = urljoin(url, file_url)
files.append(file_url)
return files
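# Note: the default summarization model can only attend to a limited input window
# (on the order of 1024 tokens), so long pages are split into word chunks, each chunk
# is summarized separately, and the partial summaries are joined.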
def summarize_long_text(text, chunk_size=500):
    # Initialize the summarization pipeline (downloads the default model on first use)
    summarizer = pipeline('summarization')
    # Tokenize the text into words
    words = text.split()
    # Split the words into chunks small enough to stay within the model's input window
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    # Summarize each chunk, truncating any chunk that still exceeds the model limit
    summarized_chunks = [
        summarizer(chunk, max_length=150, min_length=50, do_sample=False, truncation=True)[0]['summary_text']
        for chunk in chunks
    ]
    # Combine the summarized chunks into the final summary
    final_summary = ' '.join(summarized_chunks)
    return final_summary
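# Fetch a page, extract its visible text, save a copy under text/content.txt,
# and return a summary of that text (or None if nothing was found).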
def scrapper(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Let HTTP errors propagate so the caller can report the status code
        raise
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
    soup = BeautifulSoup(response.content, 'html.parser')
    # Save the extracted text to the text folder
    text_content = finder(url, soup, 'text')
    os.makedirs('text', exist_ok=True)
    full_text = ''
    if text_content:
        with open('text/content.txt', 'w', encoding='utf-8') as text_file:
            for line in text_content:
                text_file.write(line + '\n')
                full_text += line + ' '
    # Nothing to summarize if the page yielded no text
    if not full_text.strip():
        return None
    # Summarize the collected text
    summary = summarize_long_text(full_text)
    return summary
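# Entry point for the submit button: validates the URL, runs the scraper,
# and converts failures into user-facing error messages.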
def checker(url):
if not url:
raise Exception("URL cannot be empty.")
if not url.startswith("https://"):
raise Exception("The URL must begin with https://")
try:
summary_text = scrapper(url)
except requests.exceptions.HTTPError as e:
if e.response.status_code == 403:
raise Exception("HTTP Error: Forbidden. Access to the URL is forbidden.")
else:
raise Exception(f"HTTP Error: {e.response.status_code}")
except TypeError as e:
raise Exception(f"TypeError: {str(e)}")
except (requests.exceptions.RequestException, ValueError) as e:
raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
    if not summary_text:
        raise Exception("No text was found on the page.")
print(f"Returning summarized text from {url} ...")
return summary_text
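# Two-column Gradio interface: URL input and submit button on the left,
# generated summary on the right.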
with gr.Blocks(theme="dwancin/theme") as app:
title = gr.Markdown('''# Web Scraping 🕵️''')
description = gr.Markdown('''Get the summarized text from your desired webpages with just a few clicks.''')
with gr.Row():
with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
url_name = gr.Textbox(
placeholder="Enter URL here",
show_label=True,
label="Website",
)
submit_button = gr.Button(
"Submit",
variant="primary",
interactive=True,
)
with gr.Column(scale=2):
            summary_output = gr.Textbox(
                label="Summary",
                elem_id="summary-text",
                lines=12,  # taller output box (replaces the unsupported size="lg")
                show_label=False,
                interactive=False,  # read-only output (replaces the unsupported readonly=True)
            )
submit_button.click(
checker,
inputs=[url_name],
outputs=[summary_output],
)
app.launch()
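# To run locally (a sketch; exact package versions not pinned):
#   pip install gradio requests beautifulsoup4 transformers torch
#   python app.py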