'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
'''
import os
import requests
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from transformers import pipeline
import torch  # not used directly, but keeps the PyTorch backend for the transformers pipeline explicit
# Function to validate URLs
def validator(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
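# For example, validator("https://example.com/page") returns True, while a
# relative href such as "/files/report.pdf" returns False and is resolved
# with urljoin() in finder() below.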
# Collect either the page's text content or links matching the given media type
def finder(url, soup, media_type):
    files = []
    # Find text
    if media_type == "text":
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())
    # Find links whose href contains the requested media type
    else:
        for link in soup.find_all('a'):
            file = link.get('href')
            if file and media_type in file:
                file_url = file
                if not validator(file_url):
                    # Relative link: resolve it against the page URL
                    file_url = urljoin(url, file_url)
                files.append(file_url)
    return files
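# Only media_type="text" is used in this app; as an illustration, a call like
# finder(url, soup, ".pdf") would instead collect absolute URLs for every link
# whose href contains ".pdf".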
def summarize_long_text(text, chunk_size=1024):
    # Initialize the summarization pipeline (reloaded on every call; acceptable for a small demo)
    summarizer = pipeline('summarization')
    # Split the text into whitespace-separated words
    words = text.split()
    # Group the words into chunks of the specified size
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    # Summarize each chunk; truncation keeps any over-long chunk within the model's input limit
    summarized_chunks = [
        summarizer(chunk, max_length=1024, min_length=50, do_sample=False, truncation=True)[0]['summary_text']
        for chunk in chunks
    ]
    # Combine the summarized chunks into the final summary
    final_summary = ' '.join(summarized_chunks)
    return final_summary
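# Chunking example: a 2,500-word page with chunk_size=1024 yields three chunks of
# 1024, 1024, and 452 words; each chunk is summarized separately and the partial
# summaries are joined with spaces into the final summary.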
def scrapper(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Re-raise HTTP errors (403, 404, ...) so the caller can report the status code
        raise
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
    soup = BeautifulSoup(response.content, 'html.parser')
    # Save the scraped text to the text folder
    text_content = finder(url, soup, 'text')
    os.makedirs('text', exist_ok=True)
    full_text = ''
    if text_content:
        with open('text/content.txt', 'w', encoding='utf-8') as text_file:
            for line in text_content:
                text_file.write(line + '\n')
                full_text += line + ' '
    # Summarize the collected text
    summary = summarize_long_text(full_text)
    return summary
def checker(url):
    if not url:
        raise Exception("URL cannot be empty.")
    if not url.startswith("https://"):
        raise Exception("The URL must begin with https://")
    try:
        summary_text = scrapper(url)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            raise Exception("HTTP Error: Forbidden. Access to the URL is forbidden.")
        else:
            raise Exception(f"HTTP Error: {e.response.status_code}")
    except TypeError as e:
        raise Exception(f"TypeError: {str(e)}")
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
    if not summary_text:
        raise Exception("No text found on the page.")
    print(f"Returning summarized text from {url} ...")
    return summary_text
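# Rough usage sketch outside the Gradio UI (the URL below is purely illustrative):
#   summary = checker("https://en.wikipedia.org/wiki/Web_scraping")
#   print(summary)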
with gr.Blocks(theme="dwancin/theme") as app:
    title = gr.Markdown('''# Web Scraping 🕵️''')
    description = gr.Markdown('''Get the summarized text from your desired webpages with just a few clicks.''')
    with gr.Row():
        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
            )
            submit_button = gr.Button(
                "Submit",
                variant="primary",
                interactive=True,
            )
        with gr.Column(scale=2):
            summary_output = gr.Textbox(
                label="Summary",
                elem_id="summary-text",
                show_label=False,
                interactive=False,  # display-only output box
            )
    submit_button.click(
        checker,
        inputs=[url_name],
        outputs=[summary_output],
    )
app.launch()