jeremierostan committed
Commit c511484
1 Parent(s): fecb781

Create app.py

Files changed (1)
  1. app.py +152 -0
app.py ADDED
@@ -0,0 +1,152 @@
+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+from fpdf import FPDF
+import os
+import re
+from urllib.parse import urlparse
+from typing import List, Tuple
+import tempfile
+
+class ArticleExtractor:
+    def __init__(self):
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+
+    def clean_text(self, text: str) -> str:
+        """Clean extracted text by removing extra whitespace and special characters."""
+        # Remove extra whitespace and newlines
+        text = re.sub(r'\s+', ' ', text).strip()
+        # Remove special characters but keep basic punctuation
+        text = re.sub(r'[^\w\s.,!?-]', '', text)
+        return text
+
+    def extract_content(self, url: str) -> Tuple[str, List[str], str]:
+        """Extract title, headings, and main content from a webpage."""
+        try:
+            response = requests.get(url, headers=self.headers, timeout=10)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Extract title
+            title = soup.title.string if soup.title and soup.title.string else "No title found"
+            title = self.clean_text(title)
+
+            # Extract headings
+            headings = []
+            for heading in soup.find_all(['h1', 'h2', 'h3']):
+                heading_text = self.clean_text(heading.get_text())
+                if heading_text and len(heading_text) > 5:  # Filter out very short headings
+                    headings.append(heading_text)
+
+            # Extract main content (paragraphs)
+            # Remove unwanted elements
+            for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer', 'aside']):
+                unwanted.decompose()
+
+            # Find article content or main content
+            content = ""
+            article = soup.find('article') or soup.find('main') or soup.find('div', class_=re.compile(r'content|article|post'))
+
+            if article:
+                paragraphs = article.find_all('p')
+            else:
+                paragraphs = soup.find_all('p')
+
+            content_parts = []
+            for p in paragraphs:
+                text = self.clean_text(p.get_text())
+                if text and len(text) > 50:  # Filter out short paragraphs
+                    content_parts.append(text)
+
+            content = '\n\n'.join(content_parts)
+
+            return title, headings, content
+
+        except Exception as e:
+            return f"Error: {str(e)}", [], "Failed to extract content"
+
+    def create_pdf(self, url: str, output_dir: str) -> str:
+        """Create a PDF document from extracted web content."""
+        title, headings, content = self.extract_content(url)
+
+        # Create PDF
+        pdf = FPDF()
+        pdf.add_page()
+
+        # Set up fonts
+        pdf.set_font('Arial', 'B', 16)
+
+        # Add title
+        pdf.cell(0, 10, title[:80], ln=True)  # Truncate very long titles
+        pdf.ln(10)
+
+        # Add headings
+        pdf.set_font('Arial', 'B', 12)
+        for heading in headings:
+            pdf.multi_cell(0, 10, heading)
+            pdf.ln(5)
+
+        # Add content
+        pdf.set_font('Arial', '', 11)
+        pdf.multi_cell(0, 10, content)
+
+        # Generate filename from URL (include netloc and path so same-site articles get distinct names)
+        filename = "article_" + re.sub(r'[^\w-]+', '_', urlparse(url).netloc + urlparse(url).path).strip('_')[:80] + ".pdf"
+        filepath = os.path.join(output_dir, filename)
+
+        # Save PDF
+        pdf.output(filepath)
+        return filepath
+
+def process_urls(urls: str) -> List[str]:
+    """Process multiple URLs and return paths to generated PDFs."""
+    # Create temporary directory for PDFs
+    temp_dir = tempfile.mkdtemp()
+
+    # Split and clean URLs
+    url_list = [url.strip() for url in urls.split('\n') if url.strip()]
+
+    # Limit to 5 URLs
+    url_list = url_list[:5]
+
+    extractor = ArticleExtractor()
+    pdf_paths = []
+
+    for url in url_list:
+        try:
+            pdf_path = extractor.create_pdf(url, temp_dir)
+            pdf_paths.append(pdf_path)
+        except Exception as e:
+            print(f"Error processing {url}: {str(e)}")
+
+    return pdf_paths
+
+# Create Gradio interface
+def gradio_interface(urls: str) -> List[str]:
+    """Gradio interface function."""
+    return process_urls(urls)
+
+# Set up the Gradio app
+iface = gr.Interface(
+    fn=gradio_interface,
+    inputs=gr.Textbox(
+        lines=5,
+        placeholder="Enter up to 5 URLs (one per line)",
+        label="URLs"
+    ),
+    outputs=gr.File(
+        label="Downloaded PDFs",
+        file_count="multiple"
+    ),
+    title="Web Content Extractor",
+    description="Extract article content from web pages and download as PDFs. Enter up to 5 URLs, one per line.",
+    examples=[
+        ["https://example.com/article1\nhttps://example.com/article2"]
+    ]
+)
+
+# Launch the app
+if __name__ == "__main__":
+    iface.launch()
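
To sanity-check the extraction and PDF generation outside the Gradio UI, a minimal smoke-test sketch is shown below. It assumes app.py is importable from the working directory and that the imported packages (gradio, requests, beautifulsoup4, fpdf) are installed; the example.com / example.org URLs are placeholders to replace with real article pages.

    # smoke_test.py -- hypothetical helper script, not part of this commit
    import tempfile

    from app import ArticleExtractor, process_urls

    if __name__ == "__main__":
        extractor = ArticleExtractor()

        # extract_content returns (title, headings, content)
        title, headings, content = extractor.extract_content("https://example.com")
        print("Title:", title)
        print("Headings:", len(headings), "| body characters:", len(content))

        # create_pdf writes one PDF into the given directory and returns its path
        out_dir = tempfile.mkdtemp()
        print("PDF saved to:", extractor.create_pdf("https://example.com", out_dir))

        # process_urls accepts the same newline-separated input as the Gradio textbox
        print(process_urls("https://example.com\nhttps://example.org"))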