Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -112,11 +112,11 @@ class WebsiteCrawler:
|
|
112 |
"""Generate llms.txt content"""
|
113 |
if not self.url_metadata:
|
114 |
return "No content was found to generate llms.txt"
|
115 |
-
|
116 |
# Sort URLs by importance and remove duplicates
|
117 |
sorted_urls = []
|
118 |
seen_titles = set()
|
119 |
-
|
120 |
for url, metadata in sorted(
|
121 |
self.url_metadata.items(),
|
122 |
key=lambda x: (x[1]["importance"], x[0]),
|
@@ -125,17 +125,17 @@ class WebsiteCrawler:
|
|
125 |
if metadata["title"] not in seen_titles:
|
126 |
sorted_urls.append((url, metadata))
|
127 |
seen_titles.add(metadata["title"])
|
128 |
-
|
129 |
if not sorted_urls:
|
130 |
return "No valid content was found"
|
131 |
-
|
132 |
# Generate content
|
133 |
content = []
|
134 |
-
|
135 |
# Use homepage metadata for main title and description
|
136 |
main_title = self.homepage_metadata.get("site_name", "Welcome")
|
137 |
homepage_description = self.homepage_metadata.get("description")
|
138 |
-
|
139 |
content.append(f"# {main_title}")
|
140 |
if homepage_description:
|
141 |
content.append(f"\n> {homepage_description}")
|
@@ -146,10 +146,30 @@ class WebsiteCrawler:
|
|
146 |
if desc and len(desc) > 20 and "null" not in desc.lower():
|
147 |
content.append(f"\n> {desc}")
|
148 |
break
|
149 |
-
|
150 |
-
#
|
151 |
-
|
152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
return "\n".join(content)
|
154 |
|
155 |
def clean_text(self, text, is_title=False):
|
|
|
112 |
"""Generate llms.txt content"""
|
113 |
if not self.url_metadata:
|
114 |
return "No content was found to generate llms.txt"
|
115 |
+
|
116 |
# Sort URLs by importance and remove duplicates
|
117 |
sorted_urls = []
|
118 |
seen_titles = set()
|
119 |
+
|
120 |
for url, metadata in sorted(
|
121 |
self.url_metadata.items(),
|
122 |
key=lambda x: (x[1]["importance"], x[0]),
|
|
|
125 |
if metadata["title"] not in seen_titles:
|
126 |
sorted_urls.append((url, metadata))
|
127 |
seen_titles.add(metadata["title"])
|
128 |
+
|
129 |
if not sorted_urls:
|
130 |
return "No valid content was found"
|
131 |
+
|
132 |
# Generate content
|
133 |
content = []
|
134 |
+
|
135 |
# Use homepage metadata for main title and description
|
136 |
main_title = self.homepage_metadata.get("site_name", "Welcome")
|
137 |
homepage_description = self.homepage_metadata.get("description")
|
138 |
+
|
139 |
content.append(f"# {main_title}")
|
140 |
if homepage_description:
|
141 |
content.append(f"\n> {homepage_description}")
|
|
|
146 |
if desc and len(desc) > 20 and "null" not in desc.lower():
|
147 |
content.append(f"\n> {desc}")
|
148 |
break
|
149 |
+
|
150 |
+
# Group by category
|
151 |
+
categories = defaultdict(list)
|
152 |
+
for url, metadata in sorted_urls:
|
153 |
+
if metadata["title"] and url:
|
154 |
+
categories[metadata["category"]].append((url, metadata))
|
155 |
+
|
156 |
+
# Add sections
|
157 |
+
for category in ["Docs", "API", "Guides", "Examples", "Blog", "Optional"]:
|
158 |
+
if category in categories:
|
159 |
+
content.append(f"\n## {category}")
|
160 |
+
|
161 |
+
# Add links without extra newlines
|
162 |
+
links = []
|
163 |
+
for url, metadata in categories[category]:
|
164 |
+
title = metadata["title"].strip()
|
165 |
+
desc = self.clean_description(metadata["description"])
|
166 |
+
if desc:
|
167 |
+
links.append(f"- [{title}]({url}): {desc}")
|
168 |
+
else:
|
169 |
+
links.append(f"- [{title}]({url})")
|
170 |
+
|
171 |
+
content.append("\n".join(links))
|
172 |
+
|
173 |
return "\n".join(content)
|
174 |
|
175 |
def clean_text(self, text, is_title=False):
|