'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
'''
import os
import requests
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from transformers import pipeline
import torch  # not used directly, but keeps the PyTorch backend for the transformers pipeline explicit
# Function to validate URLs
def validator(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
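# For example, validator("https://example.com/page") returns True, while a
# relative href such as "/files/report.pdf" returns False and is resolved
# with urljoin() in finder() below.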
# Collect either the page's text content or links matching the given media type
def finder(url, soup, media_type):
    files = []
    # Find text
    if media_type == "text":
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())
    # Find links whose href contains the requested media type
    else:
        for link in soup.find_all('a'):
            file = link.get('href')
            if file and media_type in file:
                file_url = file
                if not validator(file_url):
                    # Relative link: resolve it against the page URL
                    file_url = urljoin(url, file_url)
                files.append(file_url)
    return files
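# Only media_type="text" is used in this app; as an illustration, a call like
# finder(url, soup, ".pdf") would instead collect absolute URLs for every link
# whose href contains ".pdf".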
def summarize_long_text(text, chunk_size=1024):
    # Initialize the summarization pipeline (reloaded on every call; acceptable for a small demo)
    summarizer = pipeline('summarization')
    # Split the text into whitespace-separated words
    words = text.split()
    # Group the words into chunks of the specified size
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    # Summarize each chunk; truncation keeps any over-long chunk within the model's input limit
    summarized_chunks = [
        summarizer(chunk, max_length=1024, min_length=50, do_sample=False, truncation=True)[0]['summary_text']
        for chunk in chunks
    ]
    # Combine the summarized chunks into the final summary
    final_summary = ' '.join(summarized_chunks)
    return final_summary
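# Chunking example: a 2,500-word page with chunk_size=1024 yields three chunks of
# 1024, 1024, and 452 words; each chunk is summarized separately and the partial
# summaries are joined with spaces into the final summary.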
def scrapper(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Re-raise HTTP errors (403, 404, ...) so the caller can report the status code
        raise
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
    soup = BeautifulSoup(response.content, 'html.parser')
    # Save the scraped text to the text folder
    text_content = finder(url, soup, 'text')
    os.makedirs('text', exist_ok=True)
    full_text = ''
    if text_content:
        with open('text/content.txt', 'w', encoding='utf-8') as text_file:
            for line in text_content:
                text_file.write(line + '\n')
                full_text += line + ' '
    # Summarize the collected text
    summary = summarize_long_text(full_text)
    return summary
def checker(url):
    if not url:
        raise Exception("URL cannot be empty.")
    if not url.startswith("https://"):
        raise Exception("The URL must begin with https://")
    try:
        summary_text = scrapper(url)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            raise Exception("HTTP Error: Forbidden. Access to the URL is forbidden.")
        else:
            raise Exception(f"HTTP Error: {e.response.status_code}")
    except TypeError as e:
        raise Exception(f"TypeError: {str(e)}")
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
    if not summary_text:
        raise Exception("No text found on the page.")
    print(f"Returning summarized text from {url} ...")
    return summary_text
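# Rough usage sketch outside the Gradio UI (the URL below is purely illustrative):
#   summary = checker("https://en.wikipedia.org/wiki/Web_scraping")
#   print(summary)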
with gr.Blocks(theme="dwancin/theme") as app:
    title = gr.Markdown('''# Web Scraping 🕵️''')
    description = gr.Markdown('''Get the summarized text from your desired webpages with just a few clicks.''')
    with gr.Row():
        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
            )
            submit_button = gr.Button(
                "Submit",
                variant="primary",
                interactive=True,
            )
        with gr.Column(scale=2):
            summary_output = gr.Textbox(
                label="Summary",
                elem_id="summary-text",
                show_label=False,
                interactive=False,  # display-only output box
            )
    submit_button.click(
        checker,
        inputs=[url_name],
        outputs=[summary_output],
    )
app.launch()