# url_scrape / app.py
import gradio as gr
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
import os
import re
from urllib.parse import urlparse
from typing import List, Tuple
import tempfile
class ArticleExtractor:
def __init__(self):
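        # A desktop-browser User-Agent: some sites block or serve stripped-down
        # pages to the default python-requests User-Agent.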
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
    def clean_text(self, text: str) -> str:
        """Clean extracted text by removing extra whitespace and special characters."""
        # Collapse runs of whitespace and newlines into single spaces
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        # FPDF's core fonts only support latin-1; replace anything outside that
        # range so pdf.output() does not raise a UnicodeEncodeError later
        text = text.encode('latin-1', 'replace').decode('latin-1')
        return text
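    # Illustrative example (not executed; the input string is made up):
    #   clean_text("  Breaking:\n  markets rally!  ") -> "Breaking markets rally!"
    # The colon is dropped because it is not in the kept set [.,!?-].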
def extract_content(self, url: str) -> Tuple[str, List[str], str]:
"""Extract title, headings, and main content from a webpage."""
try:
response = requests.get(url, headers=self.headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
            # Extract title (soup.title.string can be None, e.g. for an empty <title> tag)
            title = soup.title.string if soup.title and soup.title.string else "No title found"
            title = self.clean_text(title)
# Extract headings
headings = []
for heading in soup.find_all(['h1', 'h2', 'h3']):
heading_text = self.clean_text(heading.get_text())
if heading_text and len(heading_text) > 5: # Filter out very short headings
headings.append(heading_text)
# Extract main content (paragraphs)
# Remove unwanted elements
for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer', 'aside']):
unwanted.decompose()
# Find article content or main content
content = ""
article = soup.find('article') or soup.find('main') or soup.find('div', class_=re.compile(r'content|article|post'))
if article:
paragraphs = article.find_all('p')
else:
paragraphs = soup.find_all('p')
content_parts = []
for p in paragraphs:
text = self.clean_text(p.get_text())
if text and len(text) > 50: # Filter out short paragraphs
content_parts.append(text)
content = '\n\n'.join(content_parts)
return title, headings, content
except Exception as e:
return f"Error: {str(e)}", [], "Failed to extract content"
def create_pdf(self, url: str, output_dir: str) -> str:
"""Create a PDF document from extracted web content."""
title, headings, content = self.extract_content(url)
# Create PDF
pdf = FPDF()
pdf.add_page()
# Set up fonts
pdf.set_font('Arial', 'B', 16)
# Add title
pdf.cell(0, 10, title[:80], ln=True) # Truncate very long titles
pdf.ln(10)
# Add headings
pdf.set_font('Arial', 'B', 12)
for heading in headings:
pdf.multi_cell(0, 10, heading)
pdf.ln(5)
# Add content
pdf.set_font('Arial', '', 11)
pdf.multi_cell(0, 10, content)
        # Build a filename from the URL's host and path so that multiple articles
        # from the same domain do not overwrite each other
        parsed = urlparse(url)
        slug = re.sub(r'\W+', '_', f"{parsed.netloc}{parsed.path}").strip('_') or "article"
        filename = f"article_{slug}.pdf"
        filepath = os.path.join(output_dir, filename)
# Save PDF
pdf.output(filepath)
return filepath
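# Minimal standalone usage sketch (assumes network access and a writable directory;
# the URL below is a placeholder, not a tested endpoint):
#
#   extractor = ArticleExtractor()
#   pdf_path = extractor.create_pdf("https://example.com/some-article", tempfile.mkdtemp())
#   print(pdf_path)  # e.g. /tmp/tmpXXXXXX/article_example_com_some_article.pdf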
def process_urls(urls: str) -> List[str]:
"""Process multiple URLs and return paths to generated PDFs."""
# Create temporary directory for PDFs
temp_dir = tempfile.mkdtemp()
# Split and clean URLs
url_list = [url.strip() for url in urls.split('\n') if url.strip()]
# Limit to 5 URLs
url_list = url_list[:5]
extractor = ArticleExtractor()
pdf_paths = []
for url in url_list:
try:
pdf_path = extractor.create_pdf(url, temp_dir)
pdf_paths.append(pdf_path)
except Exception as e:
print(f"Error processing {url}: {str(e)}")
return pdf_paths
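# Illustrative call outside Gradio (URLs are placeholders; failed URLs are skipped,
# so the returned list may be shorter than the input):
#
#   paths = process_urls("https://example.com/a\nhttps://example.com/b")
#   for p in paths:
#       print(p)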
# Create Gradio interface
def gradio_interface(urls: str) -> List[str]:
"""Gradio interface function."""
return process_urls(urls)
# Set up the Gradio app
iface = gr.Interface(
fn=gradio_interface,
inputs=gr.Textbox(
lines=5,
placeholder="Enter up to 5 URLs (one per line)",
label="URLs"
),
outputs=gr.File(
label="Downloaded PDFs",
file_count="multiple"
),
title="Web Content Extractor",
description="Extract article content from web pages and download as PDFs. Enter up to 5 URLs, one per line.",
examples=[
["https://example.com/article1\nhttps://example.com/article2"]
]
)
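# Note: on Hugging Face Spaces the default launch() is sufficient; for local testing,
# iface.launch(share=True) (a standard Gradio option) also prints a temporary public URL.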
# Launch the app
if __name__ == "__main__":
iface.launch()