'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
'''
import os

import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
from urllib.parse import urljoin, urlparse
# Validate that a URL is absolute (has both a scheme and a network location)
def validator(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
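# For example:
#   validator("https://example.com/img/logo.png")  -> True
#   validator("/img/logo.png")                     -> False (relative link)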
# Collect either the visible text of a page or links to files of a given type
def finder(url, soup, media_type):
    files = []
    # Find text
    if media_type == "text":
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())
    # Find links whose href contains the requested media type
    else:
        for link in soup.find_all('a'):
            file = link.get('href')
            if file and media_type in file:
                file_url = file
                # Resolve relative links against the page URL
                if not validator(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)
    return files
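# For example, finder(url, soup, '.pdf') returns the URLs of links whose href
# contains '.pdf' (resolved to absolute form), while finder(url, soup, 'text')
# returns the page's text nodes.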
def summarize_long_text(text, chunk_size=500):
    # Initialize the summarization pipeline (downloads the default model on first use)
    summarizer = pipeline('summarization')
    # Tokenize the text into whitespace-delimited words
    words = text.split()
    # Split the words into chunks small enough for the model's input limit
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    # Summarize each chunk; truncation guards against chunks that still exceed the limit
    summarized_chunks = [
        summarizer(chunk, max_length=150, min_length=50, do_sample=False, truncation=True)[0]['summary_text']
        for chunk in chunks
    ]
    # Combine the summarized chunks into the final summary
    final_summary = ' '.join(summarized_chunks)
    return final_summary
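# Note: pipeline('summarization') loads a library-chosen default checkpoint
# (historically 'sshleifer/distilbart-cnn-12-6'). A sketch for pinning a
# specific model instead, if reproducibility matters:
#   summarizer = pipeline('summarization', model='sshleifer/distilbart-cnn-12-6')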
# Fetch a page, save its text content, and return a summary of it
def scrapper(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Propagate HTTP errors so the caller can inspect the status code
        raise
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
    soup = BeautifulSoup(response.content, 'html.parser')
    # Add text files to the text folder
    text_content = finder(url, soup, 'text')
    os.makedirs('text', exist_ok=True)
    full_text = ''
    if text_content:
        with open('text/content.txt', 'w', encoding='utf-8') as text_file:
            for line in text_content:
                text_file.write(line + '\n')
                full_text += line + ' '
    # Summarize the collected text
    summary = summarize_long_text(full_text)
    return summary
# Validate the input URL, run the scraper, and surface readable errors
def checker(url):
    if not url:
        raise Exception("URL cannot be empty.")
    if not url.startswith("https://"):
        raise Exception("The URL must begin with https://")
    try:
        summary_text = scrapper(url)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            raise Exception("HTTP Error: Forbidden. Access to the URL is forbidden.")
        else:
            raise Exception(f"HTTP Error: {e.response.status_code}")
    except TypeError as e:
        raise Exception(f"TypeError: {str(e)}")
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
    if not summary_text:
        raise Exception("Found no text.")
    print(f"Returning summarized text from {url} ...")
    return summary_text
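# Note: inside a Gradio event handler, raising gr.Error("message") instead of a
# bare Exception would show the message itself in the UI's error toast.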
# Build the Gradio interface
with gr.Blocks(theme="dwancin/theme") as app:
    title = gr.Markdown('''# Web Scraping 🕵️''')
    description = gr.Markdown('''Get the summarized text from your desired webpages with just a few clicks.''')
    with gr.Row():
        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
            )
            submit_button = gr.Button(
                "Submit",
                variant="primary",
                interactive=True,
            )
        with gr.Column(scale=2):
            summary_output = gr.Textbox(
                label="Summary",
                elem_id="summary-text",
                show_label=False,
                interactive=False,
            )
    submit_button.click(
        checker,
        inputs=[url_name],
        outputs=[summary_output],
    )

app.launch()
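# A minimal sketch of using the scraper without the UI (the summarization
# model is downloaded on first run); run this instead of app.launch():
#   print(checker("https://example.com"))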