cyberandy committed on
Commit
ab2a9d9
·
verified ·
1 Parent(s): 330f459

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +213 -119
app.py CHANGED
@@ -4,119 +4,183 @@ from bs4 import BeautifulSoup
4
  import re
5
  from urllib.parse import urljoin, urlparse
6
  import markdown
 
 
 
 
 
7
 
8
def get_website_title(soup):
    """Return the best available title for a parsed page.

    Candidates are tried in order: the ``og:title`` meta tag, the ``<title>``
    tag, then the first ``<h1>``. A candidate is used only when it yields
    non-empty text after stripping surrounding whitespace.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find`` API.

    Returns:
        The title text, or the placeholder ``"Website Title"`` when no
        candidate produces any text.
    """
    meta_title = soup.find('meta', property='og:title')
    if meta_title:
        # .get() avoids a KeyError on an og:title tag without a content attribute.
        og_title = (meta_title.get('content') or '').strip()
        if og_title:
            return og_title

    # Fall back to the document title, then to the first heading.
    for tag_name in ('title', 'h1'):
        tag = soup.find(tag_name)
        if tag:
            text = tag.text.strip()
            if text:
                return text

    return "Website Title"
26
-
27
def get_website_description(soup):
    """Return a short description for a parsed page.

    Candidates are tried in order: the ``description`` meta tag, the
    ``og:description`` meta tag, then the first ``<p>`` element. A candidate
    that exists but is empty is skipped so a later one can still be used.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find`` API.

    Returns:
        The description text, or the placeholder ``"Website description"``
        when no candidate produces any text.
    """
    meta_candidates = (
        soup.find('meta', {'name': 'description'}),
        soup.find('meta', property='og:description'),
    )
    for meta in meta_candidates:
        if meta:
            text = (meta.get('content') or '').strip()
            if text:
                return text

    # Fall back to the first paragraph of the page.
    first_p = soup.find('p')
    if first_p:
        text = first_p.text.strip()
        if text:
            return text

    return "Website description"
40
-
41
def _collect_links(container, base_url, section, seen_urls):
    """Return link records for the usable anchors inside *container*.

    An anchor is skipped when its resolved URL is already in *seen_urls*,
    uses a ``javascript:``/``mailto:``/``tel:`` scheme, or has link text of
    one character or less. Accepted URLs are added to *seen_urls*.
    """
    collected = []
    for a in container.find_all('a', href=True):
        url = urljoin(base_url, a['href'])
        if url in seen_urls or url.startswith(('javascript:', 'mailto:', 'tel:')):
            continue
        # Collapse internal whitespace: a newline inside the link text would
        # break the Markdown list item it is later written into.
        text = ' '.join(a.text.split())
        if len(text) > 1:  # Avoid empty or single-character links
            collected.append({'title': text, 'url': url, 'section': section})
            seen_urls.add(url)
    return collected


def get_important_links(soup, base_url):
    """Extract navigation and footer links from a parsed page.

    Links inside ``<nav>`` and ``<header>`` elements are tagged with section
    ``'Docs'``; links inside the first ``<footer>`` are tagged ``'Optional'``.
    Each URL is reported once, under the first section in which it appears.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find``/``find_all`` API.
        base_url: URL against which relative hrefs are resolved.

    Returns:
        A list of dicts with the keys ``'title'``, ``'url'`` and ``'section'``.
    """
    links = []
    seen_urls = set()

    for nav in soup.find_all(['nav', 'header']):
        links.extend(_collect_links(nav, base_url, 'Docs', seen_urls))

    footer = soup.find('footer')
    if footer:
        links.extend(_collect_links(footer, base_url, 'Optional', seen_urls))

    return links
77
 
78
def generate_llms_txt(url):
    """Fetch *url* and render an llms.txt document for that single page.

    The page is downloaded with a 10 second timeout and parsed with
    ``html.parser``. The output holds the title, the description, a ``Docs``
    section and, when any exist, an ``Optional`` section.

    Returns:
        The generated text, or a string starting with
        ``"Error generating llms.txt: "`` when any step raises.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        # Header: title, description, then the start of the Docs section.
        parts = [
            f"# {get_website_title(soup)}\n",
            f"> {get_website_description(soup)}\n",
            "## Docs\n",
        ]

        found = get_important_links(soup, url)
        parts.extend(
            f"- [{entry['title']}]({entry['url']}): Documentation page\n"
            for entry in found
            if entry['section'] == 'Docs'
        )

        # The Optional heading is only written when there is something under it.
        extras = [entry for entry in found if entry['section'] == 'Optional']
        if extras:
            parts.append("\n## Optional\n")
            parts.extend(f"- [{entry['title']}]({entry['url']})\n" for entry in extras)

        return "\n".join(parts)
    except Exception as e:
        return f"Error generating llms.txt: {e}"
 
 
 
 
 
 
 
 
 
120
 
121
  def save_llms_txt(content, save_path="llms.txt"):
122
  """Save the generated content to a file"""
@@ -127,32 +191,62 @@ def save_llms_txt(content, save_path="llms.txt"):
127
  except Exception as e:
128
  return f"Error saving file: {str(e)}"
129
 
130
- # Create Gradio interface
131
def process_url(url, save_to_file=False):
    """Generate llms.txt content for *url* and optionally save it.

    Returns:
        A ``(content, status)`` tuple. ``status`` is the message returned by
        ``save_llms_txt`` when *save_to_file* is true, otherwise a note that
        nothing was saved.
    """
    content = generate_llms_txt(url)
    status = (
        save_llms_txt(content)
        if save_to_file
        else "File not saved (checkbox not selected)"
    )
    return content, status
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
  # Create the Gradio interface
139
# Input widgets, in the positional order process_url expects them.
_input_components = [
    gr.Textbox(label="Website URL", placeholder="Enter the website URL..."),
    gr.Checkbox(label="Save to file", value=False),
]

# Output widgets: the generated text, then the save status message.
_output_components = [
    gr.Textbox(label="Generated llms.txt Content", lines=10),
    gr.Textbox(label="Status"),
]

# One example row per line: [url, save_to_file].
_example_rows = [
    ["https://example.com", False],
    ["https://docs.python.org", True],
]

iface = gr.Interface(
    fn=process_url,
    inputs=_input_components,
    outputs=_output_components,
    title="llms.txt Generator",
    description="Generate an llms.txt file from a website following the specification. The tool extracts relevant information and creates a structured markdown file suitable for LLMs.",
    examples=_example_rows,
    theme=gr.themes.Soft(),
)
157
 
158
  # Launch the app
 
4
  import re
5
  from urllib.parse import urljoin, urlparse
6
  import markdown
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ import asyncio
9
+ from collections import defaultdict
10
+ import time
11
+ import logging
12
 
13
+ # Set up logging
14
+ logging.basicConfig(level=logging.INFO)
15
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
class WebsiteCrawler:
    """Same-host, breadth-first crawler that collects page data for llms.txt.

    ``crawl_website`` fills ``visited_urls``, ``url_content`` and
    ``url_metadata``; ``generate_llms_txt`` renders the collected metadata.
    """

    # URL path suffixes that are skipped instead of fetched.
    SKIPPED_EXTENSIONS = ('.pdf', '.jpg', '.png', '.gif', '.zip')

    # (URL keywords, importance bonus, category), in the order they are tested.
    CATEGORY_RULES = (
        (('docs', 'documentation'), 5, 'Docs'),
        (('api',), 4, 'API'),
        (('guide', 'tutorial'), 3, 'Guides'),
        (('example',), 2, 'Examples'),
        (('blog',), 1, 'Blog'),
    )

    # Order in which category sections are written to the output.
    CATEGORY_ORDER = ('Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional')

    def __init__(self, max_depth=3, max_pages=50, timeout=30):
        """Create a crawler.

        Args:
            max_depth: Deepest link distance from the start URL to fetch.
            max_pages: Maximum number of pages to fetch successfully.
            timeout: Per-request timeout passed to ``requests.get``.
        """
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.timeout = timeout
        self.visited_urls = set()
        self.url_content = {}
        self.url_metadata = defaultdict(dict)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    @staticmethod
    def _clean_text(text):
        """Collapse whitespace runs (including newlines) into single spaces."""
        return ' '.join(text.split()) if text else ''

    def _meta_content(self, soup, attrs):
        """Return the cleaned ``content`` of the first matching ``<meta>``, or ''."""
        tag = soup.find('meta', attrs=attrs)
        if tag is None:
            return ''
        # .get() tolerates a meta tag that has no content attribute.
        return self._clean_text(tag.get('content'))

    def is_valid_url(self, url, base_domain):
        """Check if URL is http(s), on the same host as *base_domain*, and not a skipped file type.

        The extension test is case-insensitive and looks only at the URL
        path, so ``/a.PDF`` and ``/a.zip?dl=1`` are both rejected.

        Returns:
            True when the URL should be crawled; False otherwise, including
            when either URL cannot be parsed.
        """
        try:
            parsed = urlparse(url)
            base_parsed = urlparse(base_domain)
        except (ValueError, TypeError, AttributeError):
            return False
        return (
            parsed.scheme in ('http', 'https')
            and parsed.netloc.lower() == base_parsed.netloc.lower()
            and not parsed.path.lower().endswith(self.SKIPPED_EXTENSIONS)
        )

    def extract_content(self, soup):
        """Extract the readable text of a page.

        NOTE: this removes script, style, nav, footer and header elements
        from *soup* in place, so read anything needed from those elements
        before calling it.

        Returns:
            Text of the first ``<main>``, ``<article>`` or content/main
            ``<div>`` found, else of the whole document, with text nodes
            joined by single spaces.
        """
        for element in soup(['script', 'style', 'nav', 'footer', 'header']):
            element.decompose()

        main_content = (
            soup.find('main')
            or soup.find('article')
            or soup.find('div', {'class': re.compile(r'content|main', re.I)})
        )
        root = main_content if main_content else soup
        # A separator keeps words from adjacent elements from running together.
        return root.get_text(separator=' ', strip=True)

    def get_page_metadata(self, soup, url):
        """Extract title, description, importance and category for a page.

        The title comes from ``og:title``, ``<title>``, ``<h1>`` or the last
        URL path segment, in that order; the description from the
        ``description`` or ``og:description`` meta tag. Importance and
        category are derived from keywords found anywhere in the URL.

        Returns:
            A dict with the keys ``'title'``, ``'description'``,
            ``'importance'`` and ``'category'``.
        """
        title = self._meta_content(soup, {'property': 'og:title'})
        if not title:
            for tag_name in ('title', 'h1'):
                tag = soup.find(tag_name)
                if tag is not None:
                    title = self._clean_text(tag.text)
                    if title:
                        break
        if not title:
            # Ignore a trailing slash so "/docs/" yields "docs", not "".
            title = url.rstrip('/').split('/')[-1] or url

        description = (
            self._meta_content(soup, {'name': 'description'})
            or self._meta_content(soup, {'property': 'og:description'})
        )

        # Every matching rule adds its bonus; the last matching rule names
        # the category.
        lowered_url = url.lower()
        importance = 0
        category = 'Optional'
        for keywords, bonus, label in self.CATEGORY_RULES:
            if any(keyword in lowered_url for keyword in keywords):
                importance += bonus
                category = label

        return {
            'title': title,
            'description': description,
            'importance': importance,
            'category': category,
        }

    async def crawl_page(self, url, depth, base_domain):
        """Fetch one page, record its content and metadata, and return its links.

        NOTE: the fetch uses the blocking ``requests.get``, so this coroutine
        does not yield while the request is in flight.

        Returns:
            Same-host URLs linked from the page, fragments removed and
            de-duplicated in document order. Empty when the page is skipped
            or the fetch fails (failures are logged).
        """
        if (
            depth > self.max_depth
            or url in self.visited_urls
            or len(self.visited_urls) >= self.max_pages
        ):
            return []

        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            self.visited_urls.add(url)

            soup = BeautifulSoup(response.text, 'html.parser')

            # Metadata and links are read BEFORE extract_content(), because
            # that call strips nav/header/footer (and their links) from soup.
            metadata = self.get_page_metadata(soup, url)

            links = []
            found = set()
            for anchor in soup.find_all('a', href=True):
                try:
                    target = urlparse(urljoin(url, anchor['href']))
                except ValueError:
                    # One malformed href must not discard the page's other links.
                    continue
                # "#section" variants all point at the same page.
                next_url = target._replace(fragment='').geturl()
                if next_url not in found and self.is_valid_url(next_url, base_domain):
                    found.add(next_url)
                    links.append(next_url)

            self.url_content[url] = self.extract_content(soup)
            self.url_metadata[url] = metadata
            return links

        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return []

    async def crawl_website(self, start_url):
        """Crawl breadth-first from *start_url* until a depth or page limit is hit."""
        base_domain = start_url
        queue = [(start_url, 0)]
        seen = {start_url}

        while queue and len(self.visited_urls) < self.max_pages:
            current_url, depth = queue.pop(0)

            if depth > self.max_depth:
                continue

            links = await self.crawl_page(current_url, depth, base_domain)

            # Links one level past max_depth would only be skipped later.
            if depth >= self.max_depth:
                continue
            for link in links:
                if link not in seen:
                    seen.add(link)
                    queue.append((link, depth + 1))

    def generate_llms_txt(self):
        """Generate llms.txt content from the crawled metadata.

        The heading and blockquote come from the first recorded page, which
        is the start URL when the data was filled by ``crawl_website``. Pages
        are grouped by category in ``CATEGORY_ORDER``; within a category they
        are ordered by descending importance, then by URL.

        Returns:
            The document text, or ``""`` when no page was recorded.
        """
        if not self.url_metadata:
            return ""

        home = next(iter(self.url_metadata.values()))
        content = [f"# {home.get('title')}\n"]
        if home.get('description'):
            content.append(f"> {home['description']}\n")

        ranked = sorted(
            self.url_metadata.items(),
            key=lambda item: (-item[1].get('importance', 0), item[0]),
        )
        categorized_urls = defaultdict(list)
        for url, metadata in ranked:
            categorized_urls[metadata.get('category', 'Optional')].append((url, metadata))

        for category in self.CATEGORY_ORDER:
            if category not in categorized_urls:
                continue
            content.append(f"\n## {category}\n")
            for url, metadata in categorized_urls[category]:
                title = metadata.get('title')
                desc = metadata.get('description')
                if desc:
                    # Only mark the text as cut when it really was cut.
                    summary = desc if len(desc) <= 100 else f"{desc[:100]}..."
                    content.append(f"- [{title}]({url}): {summary}\n")
                else:
                    content.append(f"- [{title}]({url})\n")

        return "\n".join(content)
 
185
  def save_llms_txt(content, save_path="llms.txt"):
186
  """Save the generated content to a file"""
 
191
  except Exception as e:
192
  return f"Error saving file: {str(e)}"
193
 
194
async def process_url(url, max_depth, max_pages, save_to_file=False):
    """Crawl *url* and generate llms.txt content from the result.

    Args:
        url: Start URL of the crawl. Surrounding whitespace is ignored.
        max_depth: Maximum crawl depth; converted with ``int()``.
        max_pages: Maximum number of pages to crawl; converted with ``int()``.
        save_to_file: When true, the content is passed to ``save_llms_txt``.

    Returns:
        A ``(content, status)`` tuple. ``content`` is ``""`` and ``status``
        starts with ``"Error: "`` when the URL is blank, when no page could
        be crawled, or when any step raises.
    """
    url = (url or "").strip()
    if not url:
        return "", "Error: please enter a website URL"

    try:
        crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
        await crawler.crawl_website(url)

        page_count = len(crawler.visited_urls)
        if page_count == 0:
            # Every fetch failed (the crawler logs each failure); report it
            # instead of returning, or saving, an empty document.
            return "", f"Error: could not crawl {url} (see the log for details)"

        content = crawler.generate_llms_txt()
        if save_to_file:
            outcome = save_llms_txt(content)
        else:
            outcome = "File not saved (checkbox not selected)"
        return content, f"Crawled {page_count} pages. {outcome}"

    except Exception as e:
        return "", f"Error: {str(e)}"
209
+
210
+ # Create the Gradio interface with custom CSS for Open Sans font
211
# Custom CSS: load Open Sans and apply it to the page body, boxes and buttons.
# (Indentation inside the rules is reconstructed; it carries no meaning in CSS.)
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');

body, .gradio-container {
    font-family: 'Open Sans', sans-serif !important;
}

.gr-box {
    border-radius: 8px !important;
    border: 1px solid #e5e7eb !important;
}

.gr-button {
    font-family: 'Open Sans', sans-serif !important;
    font-weight: 600 !important;
}
"""

# Input widgets, in the positional order the handler receives them.
_input_components = [
    gr.Textbox(label="Website URL", placeholder="Enter the website URL..."),
    gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"),
    gr.Slider(minimum=10, maximum=100, value=50, step=10, label="Maximum Pages to Crawl"),
    gr.Checkbox(label="Save to file", value=False),
]

# Output widgets: the generated text, then the crawl/save status message.
_output_components = [
    gr.Textbox(label="Generated llms.txt Content", lines=20),
    gr.Textbox(label="Status"),
]

# One example row per line: [url, max_depth, max_pages, save_to_file].
_example_rows = [
    ["https://example.com", 3, 50, False],
    ["https://docs.python.org", 3, 50, True],
]

# Create the Gradio interface
iface = gr.Interface(
    # The handler is a plain function that drives the coroutine to completion
    # with asyncio.run on each call.
    fn=lambda url, max_depth, max_pages, save: asyncio.run(process_url(url, max_depth, max_pages, save)),
    inputs=_input_components,
    outputs=_output_components,
    title="llms.txt Generator",
    description="Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.",
    examples=_example_rows,
    theme=gr.themes.Soft(),
    css=css,
)
251
 
252
  # Launch the app