Update app.py

app.py CHANGED
@@ -145,6 +145,17 @@ class WebsiteCrawler:
                 seen.add(link)
                 queue.append((link, depth + 1))
 
+    def clean_description(self, desc):
+        """Clean description text"""
+        if not desc:
+            return ""
+        # Remove leading dashes, hyphens, or colons
+        desc = re.sub(r'^[-:\s]+', '', desc)
+        # Remove any strings that are just "Editors", "APIs", etc.
+        if len(desc.split()) <= 1:
+            return ""
+        return desc.strip()
+
     def generate_llms_txt(self):
         """Generate llms.txt content"""
         if not self.url_metadata:
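The new `clean_description` helper normalizes scraped meta descriptions before they reach the generated file. Below is a minimal standalone sketch of its behavior, for illustration only; it assumes `re` is imported at module level in app.py, which the hunk uses but does not show.

```python
import re

def clean_description(desc):
    """Standalone copy of the new helper, for illustration only."""
    if not desc:
        return ""
    # Strip leading dashes, colons, and whitespace left over from scraping
    desc = re.sub(r'^[-:\s]+', '', desc)
    # Discard one-word leftovers such as "Editors" or "APIs"
    if len(desc.split()) <= 1:
        return ""
    return desc.strip()

print(clean_description("- Overview of the REST endpoints"))  # Overview of the REST endpoints
print(clean_description(": APIs"))                            # "" (single word, discarded)
print(clean_description(None))                                # ""
```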
@@ -169,44 +180,45 @@ class WebsiteCrawler:
         # Generate content
         content = []
 
-        # Find the best title for the main header
-        main_titles = [
-            metadata['title'] for _, metadata in sorted_urls
-            if 'overview' in metadata['title'].lower() or
-            'welcome' in metadata['title'].lower() or
-            'introduction' in metadata['title'].lower()
-        ]
+        # Find the best title for the main header (prefer "Welcome" or "Overview")
+        main_title = "Welcome"  # Default to Welcome
 
-        main_title = main_titles[0] if main_titles else sorted_urls[0][1]['title']
-        content.append(f"# {main_title}")
-
         # Find a good description for the blockquote
-
-
-
-
-
-
+        best_description = None
+        for _, metadata in sorted_urls:
+            desc = self.clean_description(metadata['description'])
+            if desc and len(desc) > 20 and "null" not in desc.lower():
+                best_description = desc
+                break
+
+        content.append(f"# {main_title}")
+        if best_description:
+            content.append(f"\n> {best_description}")
 
         # Group by category
         categories = defaultdict(list)
         for url, metadata in sorted_urls:
-            if metadata['title'] and url:
+            if metadata['title'] and url:
                 categories[metadata['category']].append((url, metadata))
 
         # Add sections
         for category in ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']:
             if category in categories:
-                content.append(f"\n## {category}
+                content.append(f"\n## {category}")
+
+                # Add links without extra newlines
+                links = []
                 for url, metadata in categories[category]:
                     title = metadata['title'].strip()
-                    desc =
+                    desc = self.clean_description(metadata['description'])
                     if desc:
-
+                        links.append(f"- [{title}]({url}): {desc}")
                     else:
-
+                        links.append(f"- [{title}]({url})")
+
+                content.append('\n'.join(links))
 
-        return
+        return '\n'.join(content)
 
 async def process_url(url, max_depth, max_pages):
     """Process URL and generate llms.txt"""
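With these changes, `generate_llms_txt` returns a single string in roughly the shape below. The titles, URLs, and description are hypothetical and serve only to illustrate the format produced by the f-strings above: an H1 header, an optional blockquote, and one section per category with a plain link list.

```text
# Welcome

> A hypothetical site description longer than 20 characters, taken from the first page whose cleaned description passes the filter.

## Docs
- [Getting Started](https://example.com/docs/start): How to install and run the crawler
- [Configuration](https://example.com/docs/config)

## API
- [REST Reference](https://example.com/api/reference): Endpoints, parameters, and response formats
```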