cyberandy committed on
Commit
8dd9e80
·
verified ·
1 Parent(s): f21d84e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -22
app.py CHANGED
@@ -145,6 +145,17 @@ class WebsiteCrawler:
145
  seen.add(link)
146
  queue.append((link, depth + 1))
147
 
 
 
 
 
 
 
 
 
 
 
 
148
  def generate_llms_txt(self):
149
  """Generate llms.txt content"""
150
  if not self.url_metadata:
@@ -169,44 +180,45 @@ class WebsiteCrawler:
169
  # Generate content
170
  content = []
171
 
172
- # Find the best title for the main header
173
- main_titles = [
174
- metadata['title'] for _, metadata in sorted_urls
175
- if 'overview' in metadata['title'].lower() or
176
- 'welcome' in metadata['title'].lower() or
177
- 'introduction' in metadata['title'].lower()
178
- ]
179
 
180
- main_title = main_titles[0] if main_titles else sorted_urls[0][1]['title']
181
- content.append(f"# {main_title}")
182
-
183
  # Find a good description for the blockquote
184
- descriptions = [
185
- metadata['description'] for _, metadata in sorted_urls
186
- if metadata['description'] and len(metadata['description']) > 20
187
- ]
188
- if descriptions:
189
- content.append(f"\n> {descriptions[0]}")
 
 
 
 
190
 
191
  # Group by category
192
  categories = defaultdict(list)
193
  for url, metadata in sorted_urls:
194
- if metadata['title'] and url: # Ensure we have both title and URL
195
  categories[metadata['category']].append((url, metadata))
196
 
197
  # Add sections
198
  for category in ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']:
199
  if category in categories:
200
- content.append(f"\n## {category}\n")
 
 
 
201
  for url, metadata in categories[category]:
202
  title = metadata['title'].strip()
203
- desc = metadata['description'].strip() if metadata['description'] else ""
204
  if desc:
205
- content.append(f"- [{title}]({url}): {desc}")
206
  else:
207
- content.append(f"- [{title}]({url})")
 
 
208
 
209
- return "\n".join(content)
210
 
211
  async def process_url(url, max_depth, max_pages):
212
  """Process URL and generate llms.txt"""
 
145
  seen.add(link)
146
  queue.append((link, depth + 1))
147
 
148
+ def clean_description(self, desc):
149
+ """Clean description text"""
150
+ if not desc:
151
+ return ""
152
+ # Remove leading dashes, hyphens, or colons
153
+ desc = re.sub(r'^[-:\s]+', '', desc)
154
+ # Remove any strings that are just "Editors", "APIs", etc.
155
+ if len(desc.split()) <= 1:
156
+ return ""
157
+ return desc.strip()
158
+
159
  def generate_llms_txt(self):
160
  """Generate llms.txt content"""
161
  if not self.url_metadata:
 
180
  # Generate content
181
  content = []
182
 
183
+ # Find the best title for the main header (prefer "Welcome" or "Overview")
184
+ main_title = "Welcome" # Default to Welcome
 
 
 
 
 
185
 
 
 
 
186
  # Find a good description for the blockquote
187
+ best_description = None
188
+ for _, metadata in sorted_urls:
189
+ desc = self.clean_description(metadata['description'])
190
+ if desc and len(desc) > 20 and "null" not in desc.lower():
191
+ best_description = desc
192
+ break
193
+
194
+ content.append(f"# {main_title}")
195
+ if best_description:
196
+ content.append(f"\n> {best_description}")
197
 
198
  # Group by category
199
  categories = defaultdict(list)
200
  for url, metadata in sorted_urls:
201
+ if metadata['title'] and url:
202
  categories[metadata['category']].append((url, metadata))
203
 
204
  # Add sections
205
  for category in ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']:
206
  if category in categories:
207
+ content.append(f"\n## {category}")
208
+
209
+ # Add links without extra newlines
210
+ links = []
211
  for url, metadata in categories[category]:
212
  title = metadata['title'].strip()
213
+ desc = self.clean_description(metadata['description'])
214
  if desc:
215
+ links.append(f"- [{title}]({url}): {desc}")
216
  else:
217
+ links.append(f"- [{title}]({url})")
218
+
219
+ content.append('\n'.join(links))
220
 
221
+ return '\n'.join(content)
222
 
223
  async def process_url(url, max_depth, max_pages):
224
  """Process URL and generate llms.txt"""