import gradio as gr
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
import os
import re
from urllib.parse import urlparse
from typing import List, Tuple
import tempfile


class ArticleExtractor:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def clean_text(self, text: str) -> str:
        """Clean extracted text by removing extra whitespace and special characters."""
        # Remove extra whitespace and newlines
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        return text

    @staticmethod
    def to_latin1(text: str) -> str:
        """Replace characters outside Latin-1, which FPDF's built-in fonts cannot encode."""
        return text.encode('latin-1', 'replace').decode('latin-1')

    def extract_content(self, url: str) -> Tuple[str, List[str], str]:
        """Extract title, headings, and main content from a webpage."""
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract title
            title = soup.title.string if soup.title else "No title found"
            title = self.clean_text(title)

            # Extract headings
            headings = []
            for heading in soup.find_all(['h1', 'h2', 'h3']):
                heading_text = self.clean_text(heading.get_text())
                if heading_text and len(heading_text) > 5:  # Filter out very short headings
                    headings.append(heading_text)

            # Remove unwanted elements before extracting the main content
            for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer', 'aside']):
                unwanted.decompose()

            # Find article content or main content
            article = soup.find('article') or soup.find('main') or soup.find('div', class_=re.compile(r'content|article|post'))

            if article:
                paragraphs = article.find_all('p')
            else:
                paragraphs = soup.find_all('p')

            content_parts = []
            for p in paragraphs:
                text = self.clean_text(p.get_text())
                if text and len(text) > 50:  # Filter out short paragraphs
                    content_parts.append(text)

            content = '\n\n'.join(content_parts)

            return title, headings, content

        except Exception as e:
            return f"Error: {str(e)}", [], "Failed to extract content"

    def create_pdf(self, url: str, output_dir: str) -> str:
        """Create a PDF document from extracted web content."""
        title, headings, content = self.extract_content(url)

        # FPDF's core fonts are Latin-1 only, so sanitize all text first
        title = self.to_latin1(title)
        headings = [self.to_latin1(h) for h in headings]
        content = self.to_latin1(content)

        # Create PDF
        pdf = FPDF()
        pdf.add_page()

        # Add title
        pdf.set_font('Arial', 'B', 16)
        pdf.cell(0, 10, title[:80], ln=True)  # Truncate very long titles
        pdf.ln(10)

        # Add headings
        pdf.set_font('Arial', 'B', 12)
        for heading in headings:
            pdf.multi_cell(0, 10, heading)
            pdf.ln(5)

        # Add content
        pdf.set_font('Arial', '', 11)
        pdf.multi_cell(0, 10, content)

        # Generate filename from URL; include the path so articles from the
        # same domain do not overwrite each other
        parsed = urlparse(url)
        slug = re.sub(r'[^\w]+', '_', f"{parsed.netloc}{parsed.path}").strip('_')
        filename = f"article_{slug or 'page'}.pdf"
        filepath = os.path.join(output_dir, filename)

        # Save PDF
        pdf.output(filepath)

        return filepath


def process_urls(urls: str) -> List[str]:
    """Process multiple URLs and return paths to generated PDFs."""
    # Create temporary directory for PDFs
    temp_dir = tempfile.mkdtemp()

    # Split and clean URLs, limiting to 5
    url_list = [url.strip() for url in urls.split('\n') if url.strip()]
    url_list = url_list[:5]

    extractor = ArticleExtractor()
    pdf_paths = []

    for url in url_list:
        try:
            pdf_path = extractor.create_pdf(url, temp_dir)
            pdf_paths.append(pdf_path)
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")

    return pdf_paths


# Create Gradio interface
def gradio_interface(urls: str) -> List[str]:
    """Gradio interface function."""
    return process_urls(urls)


# Set up the Gradio app
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(
        lines=5,
        placeholder="Enter up to 5 URLs (one per line)",
        label="URLs"
    ),
    outputs=gr.File(
        label="Downloaded PDFs",
        file_count="multiple"
    ),
    title="Web Content Extractor",
    description="Extract article content from web pages and download as PDFs. Enter up to 5 URLs, one per line.",
    examples=[
        ["https://example.com/article1\nhttps://example.com/article2"]
    ]
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
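
# Minimal usage sketch (assumes this file is saved as, e.g., app.py and run
# locally with `python app.py`): PDFs can also be generated without the Gradio
# UI by calling process_urls() directly with newline-separated URLs; it returns
# the paths of the generated files inside a fresh temporary directory.
#
#   pdf_paths = process_urls(
#       "https://example.com/article1\nhttps://example.com/article2"
#   )
#   for path in pdf_paths:
#       print(path)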