'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
'''

import os
import requests
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from transformers import pipeline, AutoTokenizer


# Validate that a URL has both a scheme and a network location
def validator(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def finder(url, soup, media_type):
    files = []
    # Collect visible text from paragraph and heading tags
    if media_type == "text":
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())
    return files


# Cache the summarization pipeline and tokenizer per model name so the
# model is loaded once per process instead of on every request
_models = {}

def _get_model(model_name):
    if model_name not in _models:
        _models[model_name] = (
            pipeline('summarization', model=model_name),
            AutoTokenizer.from_pretrained(model_name),
        )
    return _models[model_name]


def summarize_long_text(text, model_name="facebook/bart-large-cnn", max_chunk_tokens=500):
    summarizer, tokenizer = _get_model(model_name)

    # Tokenize without special tokens so chunk boundaries stay clean
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Split the tokens into chunks the model can handle (BART's input limit is 1024 tokens)
    chunks = [tokens[i:i + max_chunk_tokens] for i in range(0, len(tokens), max_chunk_tokens)]

    # Summarize each chunk and combine the results
    final_summary = ''
    for chunk in chunks:
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        summary = summarizer(chunk_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
        final_summary += ' ' + summary

    return final_summary.strip()


def scrapper(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Let checker() translate status codes (e.g. 403) into user-facing errors
        raise
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")

    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect the page's text content
    text_content = finder(url, soup, 'text')
    os.makedirs('text', exist_ok=True)
    full_text = ' '.join(text_content)  # Join the text content into a single string

    # Save the full text to a file
    with open('text/content.txt', 'w', encoding='utf-8') as text_file:
        text_file.write(full_text)

    # Summarize the text
    return summarize_long_text(full_text)


def checker(url):
    if not url:
        raise Exception("URL cannot be empty.")
    if not url.startswith("https://"):
        raise Exception("The URL must begin with https://")
    if not validator(url):
        raise Exception(f"The URL is not valid: {url}")

    try:
        summary_text = scrapper(url)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            raise Exception("HTTP Error: Forbidden. Access to the URL is forbidden.")
        else:
            raise Exception(f"HTTP Error: {e.response.status_code}")
    except TypeError as e:
        raise Exception(f"TypeError: {str(e)}")
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")

    if not summary_text:
        raise Exception("Found no text.")

    print(f"Returning summarized text from {url} ...")
    return summary_text


with gr.Blocks(theme="dwancin/theme") as app:
    title = gr.Markdown('''# Web Scraping 🕵️''')
    description = gr.Markdown('''Get the summarized text from your desired web pages with just a few clicks.''')

    with gr.Row():
        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
            )
            submit_button = gr.Button(
                "Submit",
                variant="primary",
                interactive=True,
            )
        with gr.Column(scale=2):
            summary_output = gr.Textbox(
                label="Summary",
                elem_id="summary-text",
                show_label=False,
                interactive=False,  # output-only field ('readonly' and 'size' are not Textbox arguments)
            )

    submit_button.click(
        checker,
        inputs=[url_name],
        outputs=[summary_output],
    )

app.launch()
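
# --- Usage sketch ---
# A minimal way to exercise the scraper without the Gradio UI, e.g. from a
# Python REPL (this assumes the file is saved as app.py; the URL below is
# only an illustrative assumption, not one the app depends on):
#
#   from app import checker
#   print(checker("https://en.wikipedia.org/wiki/Web_scraping"))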