web_scraper

Running

App Files Files Community

web_scraper / app.py

Marcepelaez

app

2df3230 verified 28 days ago

raw

history blame

5.72 kB

	import mimetypes
	import os
	import re
	import zipfile
	from urllib.parse import urljoin, urlparse

	import requests
	import streamlit as st
	from bs4 import BeautifulSoup

	def _image_extension(img_url, content_type=None, default='bin'):
	    """
	    Choose a file extension for a downloaded image.

	    Args:
	    img_url (str): Absolute URL the image was fetched from
	    content_type (str | None): Value of the response's Content-Type header
	    default (str): Extension used when nothing better can be determined

	    Returns:
	    str: Extension without the leading dot
	    """
	    # Inspect only the URL path so that dots in the host name or in the
	    # query string are never mistaken for an extension separator.
	    extension = os.path.splitext(urlparse(img_url).path)[1].lstrip('.')
	    if extension.isalnum() and len(extension) <= 5:
	        return extension

	    # No usable extension in the path: fall back to the declared MIME type.
	    if content_type:
	        mime_type = content_type.split(';')[0].strip()
	        guessed = mimetypes.guess_extension(mime_type)
	        if guessed:
	            return guessed.lstrip('.')

	    return default

	def scrape_web_content(url, timeout=10, max_images=10):
	    """
	    Scrape the web content while preserving its original formatting

	    Downloads the page, saves up to ``max_images`` of its images into the
	    ``downloaded_images`` directory, rewrites the matching <img> tags to
	    point at the saved files, and strips script/style/meta/link/noscript
	    tags. Failures are reported through Streamlit instead of being raised.

	    Args:
	    url (str): URL of the webpage
	    timeout (float): Seconds to wait for each HTTP request
	    max_images (int): Maximum number of images to download

	    Returns:
	    dict: Extracted content with text, HTML, and images, or None when the
	    page itself could not be fetched or parsed
	    """
	    try:
	        # Send a request to the URL; without a timeout an unresponsive
	        # server would block the app indefinitely.
	        response = requests.get(url, timeout=timeout)
	        response.raise_for_status()

	        # Parse the HTML content
	        soup = BeautifulSoup(response.content, 'html.parser')

	        # Create a directory to save images if it doesn't exist
	        os.makedirs('downloaded_images', exist_ok=True)

	        # Download images
	        downloaded_images = []
	        img_tags = soup.find_all('img', src=True)
	        for i, img in enumerate(img_tags[:max_images], 1):
	            try:
	                # Get the image source URL
	                img_url = img['src']

	                # Handle relative URLs
	                if not img_url.startswith(('http://', 'https://')):
	                    img_url = urljoin(url, img_url)

	                # Download the image
	                img_response = requests.get(img_url, timeout=timeout)
	                img_response.raise_for_status()

	                # Generate a unique filename with a sanitized extension
	                extension = _image_extension(
	                    img_url, img_response.headers.get('Content-Type')
	                )
	                filename = f'downloaded_images/image_{i}.{extension}'

	                # Save the image
	                with open(filename, 'wb') as f:
	                    f.write(img_response.content)

	                # Update the image tag in the soup to point to local file
	                img['src'] = filename
	                downloaded_images.append(filename)

	            except Exception as img_error:
	                # Best effort: one bad image must not abort the whole scrape.
	                st.warning(f"Could not download image {i}: {img_error}")

	        # Remove unwanted tags
	        for tag in soup(["script", "style", "meta", "link", "noscript"]):
	            tag.decompose()

	        # Convert remaining soup to HTML string
	        formatted_html = str(soup)

	        # Extract plain text for preview
	        plain_text = soup.get_text(separator='\n', strip=True)

	        return {
	            'html': formatted_html,
	            'plain_text': plain_text,
	            'images': downloaded_images
	        }

	    except Exception as e:
	        # Top-level boundary for the UI: show the error and signal failure.
	        st.error(f"Error occurred while scraping the content: {e}")
	        return None

	def _zip_images(image_paths, zip_path='downloaded_images.zip'):
	    """
	    Bundle the given image files into a zip archive and return its bytes.

	    Args:
	    image_paths (list): Paths of the image files to include
	    zip_path (str): Location where the archive is written

	    Returns:
	    bytes: Content of the finished archive
	    """
	    with zipfile.ZipFile(zip_path, 'w') as archive:
	        for img_path in image_paths:
	            # Store only the file name so the archive has no directory prefix.
	            archive.write(img_path, os.path.basename(img_path))

	    # Read the archive back inside a context manager so the handle is closed.
	    with open(zip_path, 'rb') as zip_file:
	        return zip_file.read()

	def main():
	    """
	    Main Streamlit application

	    Asks for a URL and a display mode, scrapes the page when the button is
	    pressed, renders the result in the chosen mode, and offers the
	    downloaded images as a zip file.
	    """
	    st.title("Web Content Scraper with Preserved Formatting")

	    # Get the URL from the user
	    url_input = st.text_input("Enter the URL of the web page:", "")

	    # Option to choose display mode
	    display_mode = st.radio("Display Mode:",
	                            ["Full HTML", "Plain Text", "Side-by-Side"])

	    if not st.button("Scrape Content"):
	        return

	    # Treat whitespace-only input the same as no input at all.
	    url = url_input.strip()
	    if not url:
	        st.warning("Please enter a valid URL.")
	        return

	    # Scrape the content
	    scraped_content = scrape_web_content(url)
	    if not scraped_content:
	        st.warning("Failed to scrape content from the URL.")
	        return

	    st.success("Content successfully scraped!")

	    # Display content based on selected mode
	    if display_mode == "Full HTML":
	        # Display full HTML with preserved formatting
	        st.markdown("### Formatted Web Content")
	        st.components.v1.html(scraped_content['html'], height=600, scrolling=True)

	    elif display_mode == "Plain Text":
	        # Display plain text
	        st.markdown("### Plain Text Content")
	        st.text_area("Scraped Text:", scraped_content['plain_text'], height=400)

	    else:  # Side-by-Side
	        # Split the screen to show HTML and plain text
	        col1, col2 = st.columns(2)

	        with col1:
	            st.markdown("### Formatted HTML")
	            st.components.v1.html(scraped_content['html'], height=600, scrolling=True)

	        with col2:
	            st.markdown("### Plain Text")
	            st.text_area("Scraped Text:", scraped_content['plain_text'], height=600)

	    # Display images
	    images = scraped_content['images']
	    if images:
	        st.subheader("Downloaded Images")
	        cols = st.columns(min(len(images), 3))
	        for i, img_path in enumerate(images):
	            # Cycle through the available columns (at most three).
	            with cols[i % len(cols)]:
	                st.image(img_path, use_column_width=True)

	        # Zip and download option for images
	        st.download_button(
	            label="Download All Images",
	            data=_zip_images(images),
	            file_name='downloaded_images.zip',
	            mime='application/zip'
	        )

	# Launch the app only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()