Spaces:
jeremierostan committed
Commit c511484 • Parent(s): fecb781
Create app.py
app.py ADDED
@@ -0,0 +1,152 @@
import gradio as gr
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
import os
import re
from urllib.parse import urlparse
from typing import List, Tuple
import tempfile


class ArticleExtractor:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def clean_text(self, text: str) -> str:
        """Clean extracted text by collapsing whitespace and removing special characters."""
        # Collapse runs of whitespace and newlines
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        return text

    def extract_content(self, url: str) -> Tuple[str, List[str], str]:
        """Extract title, headings, and main content from a webpage."""
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract title (guard against a <title> tag with no string)
            title = soup.title.string if soup.title and soup.title.string else "No title found"
            title = self.clean_text(title)

            # Extract headings
            headings = []
            for heading in soup.find_all(['h1', 'h2', 'h3']):
                heading_text = self.clean_text(heading.get_text())
                if heading_text and len(heading_text) > 5:  # Filter out very short headings
                    headings.append(heading_text)

            # Remove unwanted elements before collecting the main content
            for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer', 'aside']):
                unwanted.decompose()

            # Prefer a semantic article/main container; fall back to the whole page
            article = soup.find('article') or soup.find('main') or soup.find('div', class_=re.compile(r'content|article|post'))
            paragraphs = article.find_all('p') if article else soup.find_all('p')

            content_parts = []
            for p in paragraphs:
                text = self.clean_text(p.get_text())
                if text and len(text) > 50:  # Filter out short paragraphs
                    content_parts.append(text)

            content = '\n\n'.join(content_parts)

            return title, headings, content

        except Exception as e:
            return f"Error: {str(e)}", [], "Failed to extract content"

    def create_pdf(self, url: str, output_dir: str) -> str:
        """Create a PDF document from extracted web content."""
        title, headings, content = self.extract_content(url)

        # FPDF's built-in fonts are Latin-1 only; replace anything else so
        # PDF generation does not raise on non-Latin-1 text
        def to_latin1(s: str) -> str:
            return s.encode('latin-1', 'replace').decode('latin-1')

        # Create PDF
        pdf = FPDF()
        pdf.add_page()

        # Add title (truncate very long titles)
        pdf.set_font('Arial', 'B', 16)
        pdf.multi_cell(0, 10, to_latin1(title[:80]))
        pdf.ln(10)

        # Add headings
        pdf.set_font('Arial', 'B', 12)
        for heading in headings:
            pdf.multi_cell(0, 10, to_latin1(heading))
            pdf.ln(5)

        # Add content
        pdf.set_font('Arial', '', 11)
        pdf.multi_cell(0, 10, to_latin1(content))

        # Generate filename from the URL's hostname
        filename = f"article_{urlparse(url).netloc.replace('.', '_')}.pdf"
        filepath = os.path.join(output_dir, filename)

        # Avoid overwriting when several URLs share a hostname
        base, ext = os.path.splitext(filepath)
        counter = 1
        while os.path.exists(filepath):
            filepath = f"{base}_{counter}{ext}"
            counter += 1

        # Save PDF
        pdf.output(filepath)
        return filepath


def process_urls(urls: str) -> List[str]:
    """Process multiple URLs and return paths to the generated PDFs."""
    # Create a temporary directory for the PDFs
    temp_dir = tempfile.mkdtemp()

    # Split input into lines, drop blanks, and limit to 5 URLs
    url_list = [url.strip() for url in urls.split('\n') if url.strip()][:5]

    extractor = ArticleExtractor()
    pdf_paths = []

    for url in url_list:
        try:
            pdf_paths.append(extractor.create_pdf(url, temp_dir))
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")

    return pdf_paths


def gradio_interface(urls: str) -> List[str]:
    """Gradio interface function."""
    return process_urls(urls)


# Set up the Gradio app
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(
        lines=5,
        placeholder="Enter up to 5 URLs (one per line)",
        label="URLs"
    ),
    outputs=gr.File(
        label="Downloaded PDFs",
        file_count="multiple"
    ),
    title="Web Content Extractor",
    description="Extract article content from web pages and download as PDFs. Enter up to 5 URLs, one per line.",
    examples=[
        ["https://example.com/article1\nhttps://example.com/article2"]
    ]
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
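For a quick local check of the extraction logic without launching the Gradio UI, a minimal sketch (the URL below is a placeholder, not from the commit; it assumes the file above is importable as app):

    from app import process_urls

    # Returns paths to the generated PDFs in a temporary directory
    paths = process_urls("https://example.com/article1")
    print(paths)

The Space presumably also needs gradio, requests, beautifulsoup4, and fpdf (or fpdf2) listed in its requirements.txt.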