Update app.py

app.py CHANGED
@@ -145,6 +145,17 @@ class WebsiteCrawler:
                 seen.add(link)
                 queue.append((link, depth + 1))
 
+    def clean_description(self, desc):
+        """Clean description text"""
+        if not desc:
+            return ""
+        # Remove leading dashes, hyphens, or colons
+        desc = re.sub(r'^[-:\s]+', '', desc)
+        # Remove any strings that are just "Editors", "APIs", etc.
+        if len(desc.split()) <= 1:
+            return ""
+        return desc.strip()
+
     def generate_llms_txt(self):
         """Generate llms.txt content"""
         if not self.url_metadata:
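The new `clean_description` helper normalizes scraped meta descriptions before they reach the generated file. Below is a minimal standalone sketch of its behavior, for illustration only; it assumes `re` is imported at module level in app.py, which the hunk uses but does not show.

```python
import re

def clean_description(desc):
    """Standalone copy of the new helper, for illustration only."""
    if not desc:
        return ""
    # Strip leading dashes, colons, and whitespace left over from scraping
    desc = re.sub(r'^[-:\s]+', '', desc)
    # Discard one-word leftovers such as "Editors" or "APIs"
    if len(desc.split()) <= 1:
        return ""
    return desc.strip()

print(clean_description("- Overview of the REST endpoints"))  # Overview of the REST endpoints
print(clean_description(": APIs"))                            # "" (single word, discarded)
print(clean_description(None))                                # ""
```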
@@ -169,44 +180,45 @@ class WebsiteCrawler:
         # Generate content
         content = []
 
-        # Find the best title for the main header
-        main_titles = [
-            metadata['title'] for _, metadata in sorted_urls
-            if 'overview' in metadata['title'].lower() or
-            'welcome' in metadata['title'].lower() or
-            'introduction' in metadata['title'].lower()
-        ]
+        # Find the best title for the main header (prefer "Welcome" or "Overview")
+        main_title = "Welcome"  # Default to Welcome
 
-        main_title = main_titles[0] if main_titles else sorted_urls[0][1]['title']
-        content.append(f"# {main_title}")
-
         # Find a good description for the blockquote
-
-
-
-
-
-
+        best_description = None
+        for _, metadata in sorted_urls:
+            desc = self.clean_description(metadata['description'])
+            if desc and len(desc) > 20 and "null" not in desc.lower():
+                best_description = desc
+                break
+
+        content.append(f"# {main_title}")
+        if best_description:
+            content.append(f"\n> {best_description}")
 
         # Group by category
         categories = defaultdict(list)
         for url, metadata in sorted_urls:
-            if metadata['title'] and url:
+            if metadata['title'] and url:
                 categories[metadata['category']].append((url, metadata))
 
         # Add sections
         for category in ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']:
             if category in categories:
-                content.append(f"\n## {category}
+                content.append(f"\n## {category}")
+
+                # Add links without extra newlines
+                links = []
                 for url, metadata in categories[category]:
                     title = metadata['title'].strip()
-                    desc =
+                    desc = self.clean_description(metadata['description'])
                     if desc:
-
+                        links.append(f"- [{title}]({url}): {desc}")
                     else:
-
+                        links.append(f"- [{title}]({url})")
+
+                content.append('\n'.join(links))
 
-        return
+        return '\n'.join(content)
 
 async def process_url(url, max_depth, max_pages):
     """Process URL and generate llms.txt"""
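With these changes, `generate_llms_txt` returns a single string in roughly the shape below. The titles, URLs, and description are hypothetical and serve only to illustrate the format produced by the f-strings above: an H1 header, an optional blockquote, and one section per category with a plain link list.

```text
# Welcome

> A hypothetical site description longer than 20 characters, taken from the first page whose cleaned description passes the filter.

## Docs
- [Getting Started](https://example.com/docs/start): How to install and run the crawler
- [Configuration](https://example.com/docs/config)

## API
- [REST Reference](https://example.com/api/reference): Endpoints, parameters, and response formats
```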