Update app.py
app.py CHANGED
@@ -34,16 +34,40 @@ class WebsiteCrawler:
         # Normalize unicode characters
         text = unicodedata.normalize('NFKD', text)
         # Replace special quotes and dashes with standard characters
-        # Replace fancy quotes and dashes with standard ones
         text = text.replace('\u201c', '"').replace('\u201d', '"')  # smart quotes
         text = text.replace('\u2018', "'").replace('\u2019', "'")  # smart single quotes
         text = text.replace('\u2013', '-').replace('\u2014', '-')  # en and em dashes
         # Remove any remaining non-ASCII characters
         text = text.encode('ascii', 'ignore').decode('ascii')
-        # Clean up extra whitespace
+        # Clean up extra whitespace and ensure proper sentence spacing
         text = ' '.join(text.split())
         return text
 
+    def clean_title(self, title):
+        """Clean and format titles"""
+        title = self.normalize_text(title)
+        # Remove common suffixes
+        title = re.sub(r'\s*\|\s*.*$', '', title)  # Remove pipe and everything after
+        title = re.sub(r'\s*-\s*.*$', '', title)  # Remove dash and everything after
+        title = title.strip()
+        return title
+
+    def clean_description(self, desc):
+        """Clean and format descriptions"""
+        if not desc:
+            return ""
+        desc = self.normalize_text(desc)
+        # Find the last complete sentence
+        sentences = re.split(r'(?<=[.!?])\s+', desc)
+        if sentences:
+            # Take up to two complete sentences
+            cleaned_desc = ' '.join(sentences[:2]).strip()
+            # Ensure it ends with proper punctuation
+            if not cleaned_desc[-1] in '.!?':
+                cleaned_desc += '.'
+            return cleaned_desc
+        return desc
+
     def is_valid_url(self, url, base_domain):
         """Check if URL is valid and belongs to the same domain"""
         try:
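For a quick sanity check of the two new helpers, here is a standalone sketch that mirrors their logic as plain functions; the sample inputs and expected outputs are invented for illustration, and the guard clauses differ slightly from the class methods above.

import re
import unicodedata

def normalize_text(text):
    # Same idea as WebsiteCrawler.normalize_text: NFKD-normalize, drop non-ASCII, collapse whitespace
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore').decode('ascii')
    return ' '.join(text.split())

def clean_title(title):
    # Mirrors clean_title: strip "| Site Name" / "- Site Name" style suffixes
    title = normalize_text(title)
    title = re.sub(r'\s*\|\s*.*$', '', title)
    title = re.sub(r'\s*-\s*.*$', '', title)
    return title.strip()

def clean_description(desc):
    # Mirrors clean_description: keep at most two sentences, end with punctuation
    if not desc:
        return ""
    desc = normalize_text(desc)
    sentences = re.split(r'(?<=[.!?])\s+', desc)
    cleaned = ' '.join(sentences[:2]).strip()
    if cleaned and cleaned[-1] not in '.!?':
        cleaned += '.'
    return cleaned

print(clean_title("Quick Start | Example Docs"))   # -> "Quick Start"
print(clean_description("First sentence. Second one! A third that gets dropped."))
# -> "First sentence. Second one!"

One thing to watch: the dash rule in clean_title also truncates titles that legitimately contain a hyphen, e.g. "Step-by-step guide" becomes "Step".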
@@ -76,42 +100,41 @@ class WebsiteCrawler:
             'category': 'Optional'
         }
 
-        # Title extraction with
+        # Title extraction with cleaning
         title = (
             soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
             soup.find('title').text if soup.find('title') else
             soup.find('h1').text if soup.find('h1') else
             url.split('/')[-1]
         )
-        metadata['title'] = self.
+        metadata['title'] = self.clean_title(title)
 
-        # Description extraction with
+        # Description extraction with cleaning
         description = (
             soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
             soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
             ""
         )
-        metadata['description'] = self.
+        metadata['description'] = self.clean_description(description)
 
-        # Calculate importance
-
-        if 'docs' in
-            importance
+        # Calculate importance and category
+        url_lower = url.lower()
+        if 'docs' in url_lower or 'documentation' in url_lower:
+            metadata['importance'] = 5
             metadata['category'] = 'Docs'
-
-            importance
+        elif 'api' in url_lower:
+            metadata['importance'] = 4
             metadata['category'] = 'API'
-
-            importance
+        elif 'guide' in url_lower or 'tutorial' in url_lower:
+            metadata['importance'] = 3
             metadata['category'] = 'Guides'
-
-            importance
+        elif 'example' in url_lower:
+            metadata['importance'] = 2
             metadata['category'] = 'Examples'
-
-            importance
+        elif 'blog' in url_lower:
+            metadata['importance'] = 1
             metadata['category'] = 'Blog'
 
-        metadata['importance'] = importance
         return metadata
 
     async def crawl_page(self, url, depth, base_domain):
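The category/importance assignment above boils down to keyword matching on the URL. A minimal standalone sketch follows; the categorize_url helper and the fallback return value are illustrative assumptions, not code from the app (the diff only shows the 'Optional' default category).

def categorize_url(url):
    # Same keyword rules as the diff: docs rank highest, then API, guides, examples, blog
    url_lower = url.lower()
    if 'docs' in url_lower or 'documentation' in url_lower:
        return 'Docs', 5
    elif 'api' in url_lower:
        return 'API', 4
    elif 'guide' in url_lower or 'tutorial' in url_lower:
        return 'Guides', 3
    elif 'example' in url_lower:
        return 'Examples', 2
    elif 'blog' in url_lower:
        return 'Blog', 1
    return 'Optional', 0  # assumed fallback for URLs that match no keyword

print(categorize_url("https://example.com/docs/getting-started"))  # -> ('Docs', 5)
print(categorize_url("https://example.com/pricing"))               # -> ('Optional', 0)

Note these are plain substring checks, so 'api' also matches unrelated paths such as /rapid-start.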
@@ -121,7 +144,7 @@ class WebsiteCrawler:
 
         try:
             response = requests.get(url, headers=self.headers, timeout=self.timeout)
-            response.encoding = 'utf-8'
+            response.encoding = 'utf-8'
             response.raise_for_status()
             self.visited_urls.add(url)
 
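Context for the one-line encoding change above: requests picks an encoding from the Content-Type header (falling back to a guess), and response.text is decoded with whatever response.encoding holds, so pinning it to UTF-8 before reading the body makes the decode deterministic. A minimal sketch, using example.com as a placeholder URL:

import requests

response = requests.get("https://example.com", timeout=10)
response.encoding = 'utf-8'      # decode the body as UTF-8 when .text is read
response.raise_for_status()
html = response.text             # str decoded with the encoding set above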
@@ -173,6 +196,9 @@ class WebsiteCrawler:
                 reverse=True
             )
 
+        if not sorted_urls:
+            return "No content was found to generate llms.txt"
+
         # Group URLs by category
         categorized_urls = defaultdict(list)
         for url, metadata in sorted_urls:
@@ -182,24 +208,22 @@ class WebsiteCrawler:
         content = []
 
         # Add main title and description
-
-
-
-        content.append(f"> {main_metadata['description']}
+        main_metadata = sorted_urls[0][1]
+        content.append(f"# {main_metadata['title']}")
+        if main_metadata['description']:
+            content.append(f"\n> {main_metadata['description']}")
 
         # Add categorized sections
         priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
 
         for category in priority_order:
             if category in categorized_urls:
-                content.append(f"\n## {category}
+                content.append(f"\n## {category}")
                 for url, metadata in categorized_urls[category]:
-
-
-                    if desc:
-                        content.append(f"- [{title}]({url}): {desc[:100]}...\n")
+                    if metadata['description']:
+                        content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
                     else:
-                        content.append(f"- [{title}]({url})
+                        content.append(f"\n- [{metadata['title']}]({url})")
 
         return "\n".join(content)
 
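On the output format of the rewritten block above: every appended chunk after the first starts with "\n" and the list is joined with "\n", so each section header and link entry ends up preceded by a blank line. A toy run with hard-coded stand-in values (all titles, URLs and descriptions invented):

content = []
content.append("# Example Project")                              # main page title
content.append("\n> A small toolkit for doing useful things.")   # main page description
content.append("\n## Docs")                                      # category header
content.append("\n- [Getting Started](https://example.com/docs/start): Install it and run the first command.")
content.append("\n- [API Reference](https://example.com/docs/api)")
print("\n".join(content))
# # Example Project
#
# > A small toolkit for doing useful things.
#
# ## Docs
#
# - [Getting Started](https://example.com/docs/start): Install it and run the first command.
#
# - [API Reference](https://example.com/docs/api)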
@@ -223,73 +247,92 @@ async def process_url(url, max_depth, max_pages):
         await crawler.crawl_website(url)
         content = crawler.generate_llms_txt()
 
-        return content, f"Successfully crawled {len(crawler.visited_urls)} pages.
+        return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
 
     except Exception as e:
         return "", f"Error: {str(e)}"
 
-# Create
-
-
-
-
-    font-family: 'Open Sans', sans-serif !important;
-}
-
-.gr-box {
-    border-radius: 8px !important;
-    border: 1px solid #e5e7eb !important;
-}
-
-.gr-button {
-    font-family: 'Open Sans', sans-serif !important;
-    font-weight: 600 !important;
-}
-
-.gr-input {
-    font-family: 'Open Sans', sans-serif !important;
-}
-"""
+# Create custom theme with Open Sans
+theme = gr.themes.Soft().set(
+    font=["Open Sans", "ui-sans-serif", "system-ui", "sans-serif"],
+    font_mono=["IBM Plex Mono", "ui-monospace", "monospace"]
+)
 
 # Create the Gradio interface
-
-
-
-
+with gr.Blocks(theme=theme, css="""
+    @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
+    @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono&display=swap');
+
+    .gradio-container {
+        font-family: 'Open Sans', sans-serif !important;
+    }
+
+    .gr-box {
+        border-radius: 8px !important;
+        border: 1px solid #e5e7eb !important;
+    }
+
+    .gr-button {
+        font-family: 'Open Sans', sans-serif !important;
+        font-weight: 600 !important;
+    }
+
+    .gr-input {
+        font-family: 'Open Sans', sans-serif !important;
+    }
+
+    .monospace {
+        font-family: 'IBM Plex Mono', monospace !important;
+    }
+""") as iface:
+    gr.Markdown("# llms.txt Generator")
+    gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
+
+    with gr.Row():
+        url_input = gr.Textbox(
             label="Website URL",
             placeholder="Enter the website URL (e.g., example.com or https://example.com)",
             info="The URL will be automatically prefixed with https:// if no protocol is specified."
-        ),
-        gr.Slider(
-            minimum=1,
-            maximum=5,
-            value=3,
-            step=1,
-            label="Maximum Crawl Depth",
-            info="Higher values will result in more thorough but slower crawling"
-        ),
-        gr.Slider(
-            minimum=10,
-            maximum=100,
-            value=50,
-            step=10,
-            label="Maximum Pages to Crawl",
-            info="Higher values will result in more comprehensive but slower results"
         )
-
-
-    gr.
-
-
-
-
-
-
-
-
-
-
-
+
+    with gr.Row():
+        with gr.Column():
+            depth_input = gr.Slider(
+                minimum=1,
+                maximum=5,
+                value=3,
+                step=1,
+                label="Maximum Crawl Depth",
+                info="Higher values will result in more thorough but slower crawling"
+            )
+        with gr.Column():
+            pages_input = gr.Slider(
+                minimum=10,
+                maximum=100,
+                value=50,
+                step=10,
+                label="Maximum Pages to Crawl",
+                info="Higher values will result in more comprehensive but slower results"
+            )
+
+    generate_btn = gr.Button("Generate llms.txt", variant="primary")
+
+    output = gr.Textbox(
+        label="Generated llms.txt Content",
+        lines=20,
+        max_lines=30,
+        show_copy_button=True,
+        container=False,
+        scale=2
+    )
+
+    status = gr.Textbox(label="Status")
+
+    generate_btn.click(
+        fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
+        inputs=[url_input, depth_input, pages_input],
+        outputs=[output, status]
+    )
 
 # Launch the app
 if __name__ == "__main__":
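The new Blocks layout above wires the button to the async crawler through a synchronous lambda that calls asyncio.run. A stripped-down sketch of that pattern with a stub coroutine standing in for process_url (the stub and the reduced component set are placeholders, not the real app):

import asyncio
import gradio as gr

async def process_url_stub(url, depth, pages):
    # Placeholder for the real async crawl; returns (content, status) like process_url
    await asyncio.sleep(0)
    return f"# llms.txt for {url}", f"Crawled with depth={depth}, max pages={pages}"

with gr.Blocks() as demo:
    url_input = gr.Textbox(label="Website URL")
    depth_input = gr.Slider(1, 5, value=3, step=1, label="Maximum Crawl Depth")
    pages_input = gr.Slider(10, 100, value=50, step=10, label="Maximum Pages to Crawl")
    generate_btn = gr.Button("Generate llms.txt")
    output = gr.Textbox(label="Generated llms.txt Content", lines=10)
    status = gr.Textbox(label="Status")

    generate_btn.click(
        fn=lambda url, depth, pages: asyncio.run(process_url_stub(url, depth, pages)),
        inputs=[url_input, depth_input, pages_input],
        outputs=[output, status],
    )

if __name__ == "__main__":
    demo.launch()

Gradio also accepts an async function directly as fn, which would avoid the asyncio.run wrapper; the lambda approach simply mirrors what the diff does.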