cyberandy committed on
Commit
66fe9ad
·
verified ·
1 Parent(s): e9f1fb9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -10
app.py CHANGED
@@ -112,11 +112,11 @@ class WebsiteCrawler:
112
  """Generate llms.txt content"""
113
  if not self.url_metadata:
114
  return "No content was found to generate llms.txt"
115
-
116
  # Sort URLs by importance and remove duplicates
117
  sorted_urls = []
118
  seen_titles = set()
119
-
120
  for url, metadata in sorted(
121
  self.url_metadata.items(),
122
  key=lambda x: (x[1]["importance"], x[0]),
@@ -125,17 +125,17 @@ class WebsiteCrawler:
125
  if metadata["title"] not in seen_titles:
126
  sorted_urls.append((url, metadata))
127
  seen_titles.add(metadata["title"])
128
-
129
  if not sorted_urls:
130
  return "No valid content was found"
131
-
132
  # Generate content
133
  content = []
134
-
135
  # Use homepage metadata for main title and description
136
  main_title = self.homepage_metadata.get("site_name", "Welcome")
137
  homepage_description = self.homepage_metadata.get("description")
138
-
139
  content.append(f"# {main_title}")
140
  if homepage_description:
141
  content.append(f"\n> {homepage_description}")
@@ -146,10 +146,30 @@ class WebsiteCrawler:
146
  if desc and len(desc) > 20 and "null" not in desc.lower():
147
  content.append(f"\n> {desc}")
148
  break
149
-
150
- # Rest of the generation remains the same...
151
- # [Previous category grouping and link generation code]
152
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  return "\n".join(content)
154
 
155
  def clean_text(self, text, is_title=False):
 
112
  """Generate llms.txt content"""
113
  if not self.url_metadata:
114
  return "No content was found to generate llms.txt"
115
+
116
  # Sort URLs by importance and remove duplicates
117
  sorted_urls = []
118
  seen_titles = set()
119
+
120
  for url, metadata in sorted(
121
  self.url_metadata.items(),
122
  key=lambda x: (x[1]["importance"], x[0]),
 
125
  if metadata["title"] not in seen_titles:
126
  sorted_urls.append((url, metadata))
127
  seen_titles.add(metadata["title"])
128
+
129
  if not sorted_urls:
130
  return "No valid content was found"
131
+
132
  # Generate content
133
  content = []
134
+
135
  # Use homepage metadata for main title and description
136
  main_title = self.homepage_metadata.get("site_name", "Welcome")
137
  homepage_description = self.homepage_metadata.get("description")
138
+
139
  content.append(f"# {main_title}")
140
  if homepage_description:
141
  content.append(f"\n> {homepage_description}")
 
146
  if desc and len(desc) > 20 and "null" not in desc.lower():
147
  content.append(f"\n> {desc}")
148
  break
149
+
150
+ # Group by category
151
+ categories = defaultdict(list)
152
+ for url, metadata in sorted_urls:
153
+ if metadata["title"] and url:
154
+ categories[metadata["category"]].append((url, metadata))
155
+
156
+ # Add sections
157
+ for category in ["Docs", "API", "Guides", "Examples", "Blog", "Optional"]:
158
+ if category in categories:
159
+ content.append(f"\n## {category}")
160
+
161
+ # Add links without extra newlines
162
+ links = []
163
+ for url, metadata in categories[category]:
164
+ title = metadata["title"].strip()
165
+ desc = self.clean_description(metadata["description"])
166
+ if desc:
167
+ links.append(f"- [{title}]({url}): {desc}")
168
+ else:
169
+ links.append(f"- [{title}]({url})")
170
+
171
+ content.append("\n".join(links))
172
+
173
  return "\n".join(content)
174
 
175
  def clean_text(self, text, is_title=False):