import gradio as gr
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
import os
import re
from urllib.parse import urlparse
from typing import List, Tuple
import tempfile
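
# Note: the original does not pin versions; this assumes the `fpdf` import is
# satisfied by either PyFPDF or fpdf2 (both expose `from fpdf import FPDF`).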


class ArticleExtractor:
    def __init__(self):
        # A browser-style User-Agent reduces the chance of trivial bot blocking
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def clean_text(self, text: str) -> str:
        """Clean extracted text by removing extra whitespace and special characters."""
        # Collapse runs of whitespace (including newlines) into single spaces
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        return text
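    # Example (hypothetical input): clean_text("  Hello,\n  <world>! ")
    # -> "Hello, world!"  (whitespace collapsed, angle brackets stripped)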

    def extract_content(self, url: str) -> Tuple[str, List[str], str]:
        """Extract title, headings, and main content from a webpage."""
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract title (soup.title.string is None when the tag has nested markup)
            title = soup.title.string if soup.title and soup.title.string else "No title found"
            title = self.clean_text(title)

            # Extract headings
            headings = []
            for heading in soup.find_all(['h1', 'h2', 'h3']):
                heading_text = self.clean_text(heading.get_text())
                if heading_text and len(heading_text) > 5:  # Filter out very short headings
                    headings.append(heading_text)

            # Remove unwanted elements before collecting paragraphs
            for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer', 'aside']):
                unwanted.decompose()

            # Prefer a semantic container; fall back to scanning the whole page
            article = soup.find('article') or soup.find('main') or soup.find('div', class_=re.compile(r'content|article|post'))
            paragraphs = article.find_all('p') if article else soup.find_all('p')

            content_parts = []
            for p in paragraphs:
                text = self.clean_text(p.get_text())
                if text and len(text) > 50:  # Filter out short paragraphs
                    content_parts.append(text)
            content = '\n\n'.join(content_parts)

            return title, headings, content
        except Exception as e:
            return f"Error: {str(e)}", [], "Failed to extract content"

    def create_pdf(self, url: str, output_dir: str) -> str:
        """Create a PDF document from extracted web content."""
        title, headings, content = self.extract_content(url)

        # FPDF's core fonts are latin-1 only; replace unencodable characters
        # up front so pdf.output() does not raise on pages containing
        # characters outside latin-1 (clean_text keeps Unicode word chars).
        def latin1(s: str) -> str:
            return s.encode('latin-1', 'replace').decode('latin-1')

        title, content = latin1(title), latin1(content)
        headings = [latin1(h) for h in headings]

        pdf = FPDF()
        pdf.add_page()

        # Title
        pdf.set_font('Arial', 'B', 16)
        pdf.cell(0, 10, title[:80], ln=True)  # Truncate very long titles
        pdf.ln(10)

        # Headings
        pdf.set_font('Arial', 'B', 12)
        for heading in headings:
            pdf.multi_cell(0, 10, heading)
            pdf.ln(5)

        # Main content
        pdf.set_font('Arial', '', 11)
        pdf.multi_cell(0, 10, content)

        # Build the filename from host and path so two articles from the
        # same domain do not overwrite each other
        parsed = urlparse(url)
        slug = re.sub(r'\W+', '_', f"{parsed.netloc}{parsed.path}").strip('_')
        filename = f"article_{slug[:60]}.pdf"
        filepath = os.path.join(output_dir, filename)

        # Save PDF
        pdf.output(filepath)
        return filepath


def process_urls(urls: str) -> List[str]:
    """Process multiple URLs and return paths to generated PDFs."""
    # Create a temporary directory for the PDFs
    temp_dir = tempfile.mkdtemp()

    # Split input into lines, drop blanks, and cap at 5 URLs
    url_list = [url.strip() for url in urls.split('\n') if url.strip()][:5]

    extractor = ArticleExtractor()
    pdf_paths = []
    for url in url_list:
        try:
            pdf_path = extractor.create_pdf(url, temp_dir)
            pdf_paths.append(pdf_path)
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
    return pdf_paths
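

# Example (hypothetical URLs): process_urls("https://example.com/a\nhttps://example.com/b")
# returns paths like ['/tmp/tmpXXXX/article_example_com_a.pdf', ...].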


# Set up the Gradio interface
iface = gr.Interface(
    fn=process_urls,
    inputs=gr.Textbox(
        lines=5,
        placeholder="Enter up to 5 URLs (one per line)",
        label="URLs"
    ),
    outputs=gr.File(
        label="Downloaded PDFs",
        file_count="multiple"
    ),
    title="Web Content Extractor",
    description="Extract article content from web pages and download as PDFs. Enter up to 5 URLs, one per line.",
    examples=[
        ["https://example.com/article1\nhttps://example.com/article2"]
    ]
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
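
# Assuming the usual Spaces layout where this file is app.py, it can also be
# run locally with `python app.py` after installing the imports above.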