cyberandy committed on
Commit
f21d84e
·
verified ·
1 Parent(s): e81ffaf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -16
app.py CHANGED
@@ -150,39 +150,61 @@ class WebsiteCrawler:
150
  if not self.url_metadata:
151
  return "No content was found to generate llms.txt"
152
 
153
- # Sort and filter URLs
154
- sorted_urls = sorted(
 
 
 
155
  self.url_metadata.items(),
156
  key=lambda x: (x[1]['importance'], x[0]),
157
  reverse=True
158
- )
 
 
 
 
 
 
159
 
160
  # Generate content
161
  content = []
162
- main_metadata = sorted_urls[0][1]
163
- content.append(f"# {main_metadata['title']}")
164
- if main_metadata['description']:
165
- content.append(f"\n> {main_metadata['description']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  # Group by category
168
  categories = defaultdict(list)
169
- seen_titles = set()
170
-
171
  for url, metadata in sorted_urls:
172
- title = metadata['title']
173
- if title not in seen_titles:
174
  categories[metadata['category']].append((url, metadata))
175
- seen_titles.add(title)
176
 
177
  # Add sections
178
  for category in ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']:
179
  if category in categories:
180
- content.append(f"\n## {category}")
181
  for url, metadata in categories[category]:
182
- if metadata['description']:
183
- content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
 
 
184
  else:
185
- content.append(f"\n- [{metadata['title']}]({url})")
186
 
187
  return "\n".join(content)
188
 
 
150
  if not self.url_metadata:
151
  return "No content was found to generate llms.txt"
152
 
153
+ # Sort URLs by importance and remove duplicates
154
+ sorted_urls = []
155
+ seen_titles = set()
156
+
157
+ for url, metadata in sorted(
158
  self.url_metadata.items(),
159
  key=lambda x: (x[1]['importance'], x[0]),
160
  reverse=True
161
+ ):
162
+ if metadata['title'] not in seen_titles:
163
+ sorted_urls.append((url, metadata))
164
+ seen_titles.add(metadata['title'])
165
+
166
+ if not sorted_urls:
167
+ return "No valid content was found"
168
 
169
  # Generate content
170
  content = []
171
+
172
+ # Find the best title for the main header
173
+ main_titles = [
174
+ metadata['title'] for _, metadata in sorted_urls
175
+ if 'overview' in metadata['title'].lower() or
176
+ 'welcome' in metadata['title'].lower() or
177
+ 'introduction' in metadata['title'].lower()
178
+ ]
179
+
180
+ main_title = main_titles[0] if main_titles else sorted_urls[0][1]['title']
181
+ content.append(f"# {main_title}")
182
+
183
+ # Find a good description for the blockquote
184
+ descriptions = [
185
+ metadata['description'] for _, metadata in sorted_urls
186
+ if metadata['description'] and len(metadata['description']) > 20
187
+ ]
188
+ if descriptions:
189
+ content.append(f"\n> {descriptions[0]}")
190
 
191
  # Group by category
192
  categories = defaultdict(list)
 
 
193
  for url, metadata in sorted_urls:
194
+ if metadata['title'] and url: # Ensure we have both title and URL
 
195
  categories[metadata['category']].append((url, metadata))
 
196
 
197
  # Add sections
198
  for category in ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']:
199
  if category in categories:
200
+ content.append(f"\n## {category}\n")
201
  for url, metadata in categories[category]:
202
+ title = metadata['title'].strip()
203
+ desc = metadata['description'].strip() if metadata['description'] else ""
204
+ if desc:
205
+ content.append(f"- [{title}]({url}): {desc}")
206
  else:
207
+ content.append(f"- [{title}]({url})")
208
 
209
  return "\n".join(content)
210