# url_scrape / app.py
import gradio as gr
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
import os
import re
from urllib.parse import urlparse
from typing import List, Tuple
import tempfile
class ArticleExtractor:
def __init__(self):
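        # A desktop-browser User-Agent: some sites block or serve stripped-down
        # pages to the default python-requests User-Agent.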
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
    def clean_text(self, text: str) -> str:
        """Clean extracted text by removing extra whitespace and special characters."""
        # Collapse runs of whitespace and newlines into single spaces
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        # FPDF's core fonts only support latin-1; replace anything outside that
        # range so pdf.output() does not raise a UnicodeEncodeError later
        text = text.encode('latin-1', 'replace').decode('latin-1')
        return text
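    # Illustrative example (not executed; the input string is made up):
    #   clean_text("  Breaking:\n  markets rally!  ") -> "Breaking markets rally!"
    # The colon is dropped because it is not in the kept set [.,!?-].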
def extract_content(self, url: str) -> Tuple[str, List[str], str]:
"""Extract title, headings, and main content from a webpage."""
try:
response = requests.get(url, headers=self.headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
            # Extract title (soup.title.string can be None, e.g. for an empty <title> tag)
            title = soup.title.string if soup.title and soup.title.string else "No title found"
            title = self.clean_text(title)
# Extract headings
headings = []
for heading in soup.find_all(['h1', 'h2', 'h3']):
heading_text = self.clean_text(heading.get_text())
if heading_text and len(heading_text) > 5: # Filter out very short headings
headings.append(heading_text)
# Extract main content (paragraphs)
# Remove unwanted elements
for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer', 'aside']):
unwanted.decompose()
# Find article content or main content
content = ""
article = soup.find('article') or soup.find('main') or soup.find('div', class_=re.compile(r'content|article|post'))
if article:
paragraphs = article.find_all('p')
else:
paragraphs = soup.find_all('p')
content_parts = []
for p in paragraphs:
text = self.clean_text(p.get_text())
if text and len(text) > 50: # Filter out short paragraphs
content_parts.append(text)
content = '\n\n'.join(content_parts)
return title, headings, content
except Exception as e:
return f"Error: {str(e)}", [], "Failed to extract content"
def create_pdf(self, url: str, output_dir: str) -> str:
"""Create a PDF document from extracted web content."""
title, headings, content = self.extract_content(url)
# Create PDF
pdf = FPDF()
pdf.add_page()
# Set up fonts
pdf.set_font('Arial', 'B', 16)
# Add title
pdf.cell(0, 10, title[:80], ln=True) # Truncate very long titles
pdf.ln(10)
# Add headings
pdf.set_font('Arial', 'B', 12)
for heading in headings:
pdf.multi_cell(0, 10, heading)
pdf.ln(5)
# Add content
pdf.set_font('Arial', '', 11)
pdf.multi_cell(0, 10, content)
        # Build a filename from the URL's host and path so that multiple articles
        # from the same domain do not overwrite each other
        parsed = urlparse(url)
        slug = re.sub(r'\W+', '_', f"{parsed.netloc}{parsed.path}").strip('_') or "article"
        filename = f"article_{slug}.pdf"
        filepath = os.path.join(output_dir, filename)
# Save PDF
pdf.output(filepath)
return filepath
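# Minimal standalone usage sketch (assumes network access and a writable directory;
# the URL below is a placeholder, not a tested endpoint):
#
#   extractor = ArticleExtractor()
#   pdf_path = extractor.create_pdf("https://example.com/some-article", tempfile.mkdtemp())
#   print(pdf_path)  # e.g. /tmp/tmpXXXXXX/article_example_com_some_article.pdf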
def process_urls(urls: str) -> List[str]:
"""Process multiple URLs and return paths to generated PDFs."""
# Create temporary directory for PDFs
temp_dir = tempfile.mkdtemp()
# Split and clean URLs
url_list = [url.strip() for url in urls.split('\n') if url.strip()]
# Limit to 5 URLs
url_list = url_list[:5]
extractor = ArticleExtractor()
pdf_paths = []
for url in url_list:
try:
pdf_path = extractor.create_pdf(url, temp_dir)
pdf_paths.append(pdf_path)
except Exception as e:
print(f"Error processing {url}: {str(e)}")
return pdf_paths
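# Illustrative call outside Gradio (URLs are placeholders; failed URLs are skipped,
# so the returned list may be shorter than the input):
#
#   paths = process_urls("https://example.com/a\nhttps://example.com/b")
#   for p in paths:
#       print(p)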
# Create Gradio interface
def gradio_interface(urls: str) -> List[str]:
"""Gradio interface function."""
return process_urls(urls)
# Set up the Gradio app
iface = gr.Interface(
fn=gradio_interface,
inputs=gr.Textbox(
lines=5,
placeholder="Enter up to 5 URLs (one per line)",
label="URLs"
),
outputs=gr.File(
label="Downloaded PDFs",
file_count="multiple"
),
title="Web Content Extractor",
description="Extract article content from web pages and download as PDFs. Enter up to 5 URLs, one per line.",
examples=[
["https://example.com/article1\nhttps://example.com/article2"]
]
)
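# Note: on Hugging Face Spaces the default launch() is sufficient; for local testing,
# iface.launch(share=True) (a standard Gradio option) also prints a temporary public URL.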
# Launch the app
if __name__ == "__main__":
iface.launch()