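"""llms.txt Generator.

A small Gradio app that fetches a web page, extracts its title, description,
and key navigation/footer links with BeautifulSoup, and renders them as an
llms.txt-style markdown document that can optionally be saved to disk.
"""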
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_website_title(soup):
    """Extract website title from meta tags or title tag"""
    # Try the Open Graph title first; guard against tags with no content
    meta_title = soup.find('meta', property='og:title')
    if meta_title and meta_title.get('content'):
        return meta_title['content'].strip()
    
    # Try regular title tag
    title_tag = soup.find('title')
    if title_tag:
        return title_tag.text.strip()
    
    # Fallback to h1
    h1_tag = soup.find('h1')
    if h1_tag:
        return h1_tag.text.strip()
    
    return "Website Title"

def get_website_description(soup):
    """Extract website description from meta tags"""
    # Try the meta description, then the Open Graph description;
    # fall through if the tag exists but has no usable content
    meta_desc = soup.find('meta', {'name': 'description'}) or soup.find('meta', property='og:description')
    if meta_desc and meta_desc.get('content'):
        return meta_desc['content'].strip()
    
    # Fallback to first paragraph
    first_p = soup.find('p')
    if first_p:
        return first_p.text.strip()
    
    return "Website description"

def get_important_links(soup, base_url):
    """Extract important links from the website"""
    links = []
    seen_urls = set()
    
    # Look for navigation links
    nav_elements = soup.find_all(['nav', 'header'])
    for nav in nav_elements:
        for a in nav.find_all('a', href=True):
            url = urljoin(base_url, a['href'])
            if url not in seen_urls and not url.startswith(('javascript:', 'mailto:', 'tel:')):
                text = a.text.strip()
                if text and len(text) > 1:  # Avoid empty or single-character links
                    links.append({
                        'title': text,
                        'url': url,
                        'section': 'Docs'
                    })
                    seen_urls.add(url)
    
    # Look for footer links
    footer = soup.find('footer')
    if footer:
        for a in footer.find_all('a', href=True):
            url = urljoin(base_url, a['href'])
            if url not in seen_urls and not url.startswith(('javascript:', 'mailto:', 'tel:')):
                text = a.text.strip()
                if text and len(text) > 1:
                    links.append({
                        'title': text,
                        'url': url,
                        'section': 'Optional'
                    })
                    seen_urls.add(url)
    
    return links

def generate_llms_txt(url):
    try:
        # Fetch the webpage
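        # A browser-like User-Agent makes some sites less likely to
        # reject the request as an automated client.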
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Get base components
        title = get_website_title(soup)
        description = get_website_description(soup)
        links = get_important_links(soup, url)
        
        # Generate llms.txt content: H1 title, blockquote summary,
        # then one markdown link list per section
        content = [
            f"# {title}",
            "",
            f"> {description}",
            "",
            "## Docs",
            "",
        ]

        # Add documentation links (one list item per line)
        doc_links = [link for link in links if link['section'] == 'Docs']
        for link in doc_links:
            content.append(f"- [{link['title']}]({link['url']}): Documentation page")

        # Add optional links if present
        optional_links = [link for link in links if link['section'] == 'Optional']
        if optional_links:
            content.extend(["", "## Optional", ""])
            for link in optional_links:
                content.append(f"- [{link['title']}]({link['url']})")

        # Join with single newlines; blank lines are explicit entries above,
        # so list items stay on consecutive lines
        return "\n".join(content)
    except Exception as e:
        return f"Error generating llms.txt: {str(e)}"

def save_llms_txt(content, save_path="llms.txt"):
    """Save the generated content to a file"""
    try:
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(content)
        return f"Successfully saved to {save_path}"
    except Exception as e:
        return f"Error saving file: {str(e)}"

# Create Gradio interface
def process_url(url, save_to_file=False):
    content = generate_llms_txt(url)
    if save_to_file:
        save_message = save_llms_txt(content)
        return content, save_message
    return content, "File not saved (checkbox not selected)"

# Create the Gradio interface
iface = gr.Interface(
    fn=process_url,
    inputs=[
        gr.Textbox(label="Website URL", placeholder="Enter the website URL..."),
        gr.Checkbox(label="Save to file", value=False)
    ],
    outputs=[
        gr.Textbox(label="Generated llms.txt Content", lines=10),
        gr.Textbox(label="Status")
    ],
    title="llms.txt Generator",
    description="Generate an llms.txt file from a website following the specification. The tool extracts relevant information and creates a structured markdown file suitable for LLMs.",
    examples=[
        ["https://example.com", False],
        ["https://docs.python.org", True]
    ],
    theme=gr.themes.Soft()
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
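
# To run locally (assuming this file is saved as app.py):
#   pip install gradio requests beautifulsoup4
#   python app.py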