cyberandy committed on
Commit
4124a56
1 Parent(s): 26fd883

Update app.py

Files changed (1)
  1. app.py +130 -87
app.py CHANGED
@@ -34,16 +34,40 @@ class WebsiteCrawler:
        # Normalize unicode characters
        text = unicodedata.normalize('NFKD', text)
        # Replace special quotes and dashes with standard characters
-        # Replace fancy quotes and dashes with standard ones
        text = text.replace('\u201c', '"').replace('\u201d', '"')  # smart quotes
        text = text.replace('\u2018', "'").replace('\u2019', "'")  # smart single quotes
        text = text.replace('\u2013', '-').replace('\u2014', '-')  # en and em dashes
        # Remove any remaining non-ASCII characters
        text = text.encode('ascii', 'ignore').decode('ascii')
-        # Clean up extra whitespace
+        # Clean up extra whitespace and ensure proper sentence spacing
        text = ' '.join(text.split())
        return text

+    def clean_title(self, title):
+        """Clean and format titles"""
+        title = self.normalize_text(title)
+        # Remove common suffixes
+        title = re.sub(r'\s*\|\s*.*$', '', title)  # Remove pipe and everything after
+        title = re.sub(r'\s*-\s*.*$', '', title)  # Remove dash and everything after
+        title = title.strip()
+        return title
+
+    def clean_description(self, desc):
+        """Clean and format descriptions"""
+        if not desc:
+            return ""
+        desc = self.normalize_text(desc)
+        # Find the last complete sentence
+        sentences = re.split(r'(?<=[.!?])\s+', desc)
+        if sentences:
+            # Take up to two complete sentences
+            cleaned_desc = ' '.join(sentences[:2]).strip()
+            # Ensure it ends with proper punctuation
+            if not cleaned_desc[-1] in '.!?':
+                cleaned_desc += '.'
+            return cleaned_desc
+        return desc
+
    def is_valid_url(self, url, base_domain):
        """Check if URL is valid and belongs to the same domain"""
        try:
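The new clean_title/clean_description helpers drive most of the later changes, so here is a minimal standalone sketch of their behaviour (it repeats only the regex logic and skips the Unicode normalization step; the sample strings are invented):

import re

def clean_title(title: str) -> str:
    # Same suffix-stripping rules as the new WebsiteCrawler.clean_title
    title = re.sub(r'\s*\|\s*.*$', '', title)  # drop "| Site Name" style suffixes
    title = re.sub(r'\s*-\s*.*$', '', title)   # drop "- Site Name" style suffixes
    return title.strip()

def clean_description(desc: str) -> str:
    # Same two-sentence trim as the new WebsiteCrawler.clean_description
    if not desc:
        return ""
    sentences = re.split(r'(?<=[.!?])\s+', desc)
    cleaned = ' '.join(sentences[:2]).strip()
    if cleaned and cleaned[-1] not in '.!?':
        cleaned += '.'
    return cleaned

print(clean_title("Quickstart | Example Docs"))  # -> "Quickstart"
print(clean_description("Install the CLI. Set a token. Run the crawler."))
# -> "Install the CLI. Set a token."

One side effect worth noting: because \s* also matches the empty string, the dash rule truncates hyphenated titles as well (e.g. "Step-by-step guide" becomes "Step").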
@@ -76,42 +100,41 @@ class WebsiteCrawler:
            'category': 'Optional'
        }

-        # Title extraction with normalization
+        # Title extraction with cleaning
        title = (
            soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
            soup.find('title').text if soup.find('title') else
            soup.find('h1').text if soup.find('h1') else
            url.split('/')[-1]
        )
-        metadata['title'] = self.normalize_text(title)
+        metadata['title'] = self.clean_title(title)

-        # Description extraction with normalization
+        # Description extraction with cleaning
        description = (
            soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
            soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
            ""
        )
-        metadata['description'] = self.normalize_text(description)
+        metadata['description'] = self.clean_description(description)

-        # Calculate importance based on various factors
-        importance = 0
-        if 'docs' in url.lower() or 'documentation' in url.lower():
-            importance += 5
+        # Calculate importance and category
+        url_lower = url.lower()
+        if 'docs' in url_lower or 'documentation' in url_lower:
+            metadata['importance'] = 5
            metadata['category'] = 'Docs'
-        if 'api' in url.lower():
-            importance += 4
+        elif 'api' in url_lower:
+            metadata['importance'] = 4
            metadata['category'] = 'API'
-        if 'guide' in url.lower() or 'tutorial' in url.lower():
-            importance += 3
+        elif 'guide' in url_lower or 'tutorial' in url_lower:
+            metadata['importance'] = 3
            metadata['category'] = 'Guides'
-        if 'example' in url.lower():
-            importance += 2
+        elif 'example' in url_lower:
+            metadata['importance'] = 2
            metadata['category'] = 'Examples'
-        if 'blog' in url.lower():
-            importance += 1
+        elif 'blog' in url_lower:
+            metadata['importance'] = 1
            metadata['category'] = 'Blog'

-        metadata['importance'] = importance
        return metadata

    async def crawl_page(self, url, depth, base_domain):
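Since the scoring switched from an additive counter to a single if/elif chain, each page now gets exactly one importance value and one category, decided by the first keyword that matches. A rough standalone mirror of the mapping (the URLs are invented, and the 0/'Optional' fallback assumes the defaults initialised in the metadata dict above):

def classify(url: str) -> tuple[int, str]:
    # Mirrors the new chain: the first matching keyword wins
    url_lower = url.lower()
    if 'docs' in url_lower or 'documentation' in url_lower:
        return 5, 'Docs'
    elif 'api' in url_lower:
        return 4, 'API'
    elif 'guide' in url_lower or 'tutorial' in url_lower:
        return 3, 'Guides'
    elif 'example' in url_lower:
        return 2, 'Examples'
    elif 'blog' in url_lower:
        return 1, 'Blog'
    return 0, 'Optional'  # assumed fallback, matching the 'Optional' default

print(classify("https://acme.dev/docs/api/auth"))  # -> (5, 'Docs'), not API
print(classify("https://acme.dev/blog/post-1"))    # -> (1, 'Blog')
print(classify("https://acme.dev/pricing"))        # -> (0, 'Optional')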
@@ -121,7 +144,7 @@ class WebsiteCrawler:

        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
-            response.encoding = 'utf-8'  # Explicitly set encoding
+            response.encoding = 'utf-8'
            response.raise_for_status()
            self.visited_urls.add(url)
@@ -173,6 +196,9 @@ class WebsiteCrawler:
            reverse=True
        )

+        if not sorted_urls:
+            return "No content was found to generate llms.txt"
+
        # Group URLs by category
        categorized_urls = defaultdict(list)
        for url, metadata in sorted_urls:
@@ -182,24 +208,22 @@ class WebsiteCrawler:
        content = []

        # Add main title and description
-        if sorted_urls:
-            main_metadata = sorted_urls[0][1]
-            content.append(f"# {main_metadata['title']}\n")
-            content.append(f"> {main_metadata['description']}\n")
+        main_metadata = sorted_urls[0][1]
+        content.append(f"# {main_metadata['title']}")
+        if main_metadata['description']:
+            content.append(f"\n> {main_metadata['description']}")

        # Add categorized sections
        priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']

        for category in priority_order:
            if category in categorized_urls:
-                content.append(f"\n## {category}\n")
+                content.append(f"\n## {category}")
                for url, metadata in categorized_urls[category]:
-                    title = metadata['title']
-                    desc = metadata['description']
-                    if desc:
-                        content.append(f"- [{title}]({url}): {desc[:100]}...\n")
+                    if metadata['description']:
+                        content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
                    else:
-                        content.append(f"- [{title}]({url})\n")
+                        content.append(f"\n- [{metadata['title']}]({url})")

        return "\n".join(content)
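With this change, generate_llms_txt emits the llms.txt layout directly: an H1 title, an optional blockquote description, then one H2 section per category with a link bullet per page; the old 100-character truncation is gone because clean_description already caps descriptions at two sentences. Every appended element starts with "\n" and the list is joined with "\n", so entries end up separated by blank lines. Roughly, for a hypothetical two-page crawl:

# Shape of the generated output; the site, URLs and descriptions are invented.
example_output = (
    "# Acme\n"
    "\n"
    "> Acme is a hypothetical example product.\n"
    "\n"
    "## Docs\n"
    "\n"
    "- [Getting Started](https://acme.dev/docs): Install the CLI. Set a token.\n"
    "\n"
    "- [API Reference](https://acme.dev/docs/api)"
)
print(example_output)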
@@ -223,73 +247,92 @@ async def process_url(url, max_depth, max_pages):
        await crawler.crawl_website(url)
        content = crawler.generate_llms_txt()

-        return content, f"Successfully crawled {len(crawler.visited_urls)} pages. You can now copy the generated content."
+        return content, f"Successfully crawled {len(crawler.visited_urls)} pages."

    except Exception as e:
        return "", f"Error: {str(e)}"

-# Create the Gradio interface with custom CSS for Open Sans font
-css = """
-@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
-
-body, .gradio-container {
-    font-family: 'Open Sans', sans-serif !important;
-}
-
-.gr-box {
-    border-radius: 8px !important;
-    border: 1px solid #e5e7eb !important;
-}
-
-.gr-button {
-    font-family: 'Open Sans', sans-serif !important;
-    font-weight: 600 !important;
-}
-
-.gr-input {
-    font-family: 'Open Sans', sans-serif !important;
-}
-"""
+# Create custom theme with Open Sans
+theme = gr.themes.Soft().set(
+    font=["Open Sans", "ui-sans-serif", "system-ui", "sans-serif"],
+    font_mono=["IBM Plex Mono", "ui-monospace", "monospace"]
+)

# Create the Gradio interface
-iface = gr.Interface(
-    fn=lambda url, max_depth, max_pages: asyncio.run(process_url(url, max_depth, max_pages)),
-    inputs=[
-        gr.Textbox(
+with gr.Blocks(theme=theme, css="""
+@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
+@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono&display=swap');
+
+.gradio-container {
+    font-family: 'Open Sans', sans-serif !important;
+}
+
+.gr-box {
+    border-radius: 8px !important;
+    border: 1px solid #e5e7eb !important;
+}
+
+.gr-button {
+    font-family: 'Open Sans', sans-serif !important;
+    font-weight: 600 !important;
+}
+
+.gr-input {
+    font-family: 'Open Sans', sans-serif !important;
+}
+
+.monospace {
+    font-family: 'IBM Plex Mono', monospace !important;
+}
+""") as iface:
+    gr.Markdown("# llms.txt Generator")
+    gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
+
+    with gr.Row():
+        url_input = gr.Textbox(
            label="Website URL",
            placeholder="Enter the website URL (e.g., example.com or https://example.com)",
            info="The URL will be automatically prefixed with https:// if no protocol is specified."
-        ),
-        gr.Slider(
-            minimum=1,
-            maximum=5,
-            value=3,
-            step=1,
-            label="Maximum Crawl Depth",
-            info="Higher values will result in more thorough but slower crawling"
-        ),
-        gr.Slider(
-            minimum=10,
-            maximum=100,
-            value=50,
-            step=10,
-            label="Maximum Pages to Crawl",
-            info="Higher values will result in more comprehensive but slower results"
        )
-    ],
-    outputs=[
-        gr.Textbox(
-            label="Generated llms.txt Content",
-            lines=20,
-            info="Copy this content to create your llms.txt file"
-        ),
-        gr.Textbox(label="Status")
-    ],
-    title="llms.txt Generator",
-    description="Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.",
-    theme=gr.themes.Soft(),
-    css=css
-)
+
+    with gr.Row():
+        with gr.Column():
+            depth_input = gr.Slider(
+                minimum=1,
+                maximum=5,
+                value=3,
+                step=1,
+                label="Maximum Crawl Depth",
+                info="Higher values will result in more thorough but slower crawling"
+            )
+        with gr.Column():
+            pages_input = gr.Slider(
+                minimum=10,
+                maximum=100,
+                value=50,
+                step=10,
+                label="Maximum Pages to Crawl",
+                info="Higher values will result in more comprehensive but slower results"
+            )
+
+    generate_btn = gr.Button("Generate llms.txt", variant="primary")
+
+    output = gr.Textbox(
+        label="Generated llms.txt Content",
+        lines=20,
+        max_lines=30,
+        show_copy_button=True,
+        container=False,
+        scale=2
+    )
+
+    status = gr.Textbox(label="Status")
+
+    generate_btn.click(
+        fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
+        inputs=[url_input, depth_input, pages_input],
+        outputs=[output, status]
+    )

# Launch the app
if __name__ == "__main__":
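One hedged caveat on the theming hunk above: depending on the Gradio version installed in the Space, Soft().set() may not accept font/font_mono, since .set() is normally used for theme CSS-variable overrides. If that call raises, a commonly used alternative is to pass the fonts to the theme constructor; this is an alternative sketch, not what the commit ships:

import gradio as gr

# Assumption: same fonts as above, but passed to the theme constructor
# instead of .set(); GoogleFont loads the font from Google Fonts.
theme = gr.themes.Soft(
    font=[gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
    font_mono=[gr.themes.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace"],
)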
 