Spaces:

WordLift
/

create-llms-txt

Running

App Files Community

cyberandy committed on Nov 25, 2024

Commit

66fe9ad

verified ·

1 Parent(s): e9f1fb9

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -10

app.py CHANGED Viewed

@@ -112,11 +112,11 @@ class WebsiteCrawler:
         """Generate llms.txt content"""
         if not self.url_metadata:
             return "No content was found to generate llms.txt"
         # Sort URLs by importance and remove duplicates
         sorted_urls = []
         seen_titles = set()
         for url, metadata in sorted(
             self.url_metadata.items(),
             key=lambda x: (x[1]["importance"], x[0]),
@@ -125,17 +125,17 @@ class WebsiteCrawler:
             if metadata["title"] not in seen_titles:
                 sorted_urls.append((url, metadata))
                 seen_titles.add(metadata["title"])
         if not sorted_urls:
             return "No valid content was found"
         # Generate content
         content = []
         # Use homepage metadata for main title and description
         main_title = self.homepage_metadata.get("site_name", "Welcome")
         homepage_description = self.homepage_metadata.get("description")
         content.append(f"# {main_title}")
         if homepage_description:
             content.append(f"\n> {homepage_description}")
@@ -146,10 +146,30 @@ class WebsiteCrawler:
                 if desc and len(desc) > 20 and "null" not in desc.lower():
                     content.append(f"\n> {desc}")
                     break
-        # Rest of the generation remains the same...
-        # [Previous category grouping and link generation code]
         return "\n".join(content)
         def clean_text(self, text, is_title=False):

         """Generate llms.txt content"""
         if not self.url_metadata:
             return "No content was found to generate llms.txt"
         # Sort URLs by importance and remove duplicates
         sorted_urls = []
         seen_titles = set()
         for url, metadata in sorted(
             self.url_metadata.items(),
             key=lambda x: (x[1]["importance"], x[0]),
             if metadata["title"] not in seen_titles:
                 sorted_urls.append((url, metadata))
                 seen_titles.add(metadata["title"])
         if not sorted_urls:
             return "No valid content was found"
         # Generate content
         content = []
         # Use homepage metadata for main title and description
         main_title = self.homepage_metadata.get("site_name", "Welcome")
         homepage_description = self.homepage_metadata.get("description")
         content.append(f"# {main_title}")
         if homepage_description:
             content.append(f"\n> {homepage_description}")
                 if desc and len(desc) > 20 and "null" not in desc.lower():
                     content.append(f"\n> {desc}")
                     break
+        # Group by category
+        categories = defaultdict(list)
+        for url, metadata in sorted_urls:
+            if metadata["title"] and url:
+                categories[metadata["category"]].append((url, metadata))
+        # Add sections
+        for category in ["Docs", "API", "Guides", "Examples", "Blog", "Optional"]:
+            if category in categories:
+                content.append(f"\n## {category}")
+                # Add links without extra newlines
+                links = []
+                for url, metadata in categories[category]:
+                    title = metadata["title"].strip()
+                    desc = self.clean_description(metadata["description"])
+                    if desc:
+                        links.append(f"- [{title}]({url}): {desc}")
+                    else:
+                        links.append(f"- [{title}]({url})")
+                content.append("\n".join(links))
         return "\n".join(content)
         def clean_text(self, text, is_title=False):