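"""llms.txt Generator.

A small Gradio app that fetches a web page, extracts its title, description,
and key navigation/footer links with BeautifulSoup, and renders them as an
llms.txt-style markdown document that can optionally be saved to disk.
"""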
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_website_title(soup):
    """Extract website title from meta tags or title tag"""
    # Try the Open Graph title first; guard against tags with no content
    meta_title = soup.find('meta', property='og:title')
    if meta_title and meta_title.get('content'):
        return meta_title['content'].strip()
    
    # Try regular title tag
    title_tag = soup.find('title')
    if title_tag:
        return title_tag.text.strip()
    
    # Fallback to h1
    h1_tag = soup.find('h1')
    if h1_tag:
        return h1_tag.text.strip()
    
    return "Website Title"

def get_website_description(soup):
    """Extract website description from meta tags"""
    # Try the meta description, then the Open Graph description;
    # fall through if the tag exists but has no usable content
    meta_desc = soup.find('meta', {'name': 'description'}) or soup.find('meta', property='og:description')
    if meta_desc and meta_desc.get('content'):
        return meta_desc['content'].strip()
    
    # Fallback to first paragraph
    first_p = soup.find('p')
    if first_p:
        return first_p.text.strip()
    
    return "Website description"

def get_important_links(soup, base_url):
    """Extract important links from the website"""
    links = []
    seen_urls = set()
    
    # Look for navigation links
    nav_elements = soup.find_all(['nav', 'header'])
    for nav in nav_elements:
        for a in nav.find_all('a', href=True):
            url = urljoin(base_url, a['href'])
            if url not in seen_urls and not url.startswith(('javascript:', 'mailto:', 'tel:')):
                text = a.text.strip()
                if text and len(text) > 1:  # Avoid empty or single-character links
                    links.append({
                        'title': text,
                        'url': url,
                        'section': 'Docs'
                    })
                    seen_urls.add(url)
    
    # Look for footer links
    footer = soup.find('footer')
    if footer:
        for a in footer.find_all('a', href=True):
            url = urljoin(base_url, a['href'])
            if url not in seen_urls and not url.startswith(('javascript:', 'mailto:', 'tel:')):
                text = a.text.strip()
                if text and len(text) > 1:
                    links.append({
                        'title': text,
                        'url': url,
                        'section': 'Optional'
                    })
                    seen_urls.add(url)
    
    return links

def generate_llms_txt(url):
    try:
        # Fetch the webpage
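        # A browser-like User-Agent makes some sites less likely to
        # reject the request as an automated client.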
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Get base components
        title = get_website_title(soup)
        description = get_website_description(soup)
        links = get_important_links(soup, url)
        
        # Generate llms.txt content: H1 title, blockquote summary,
        # then one markdown link list per section
        content = [
            f"# {title}",
            "",
            f"> {description}",
            "",
            "## Docs",
            "",
        ]

        # Add documentation links (one list item per line)
        doc_links = [link for link in links if link['section'] == 'Docs']
        for link in doc_links:
            content.append(f"- [{link['title']}]({link['url']}): Documentation page")

        # Add optional links if present
        optional_links = [link for link in links if link['section'] == 'Optional']
        if optional_links:
            content.extend(["", "## Optional", ""])
            for link in optional_links:
                content.append(f"- [{link['title']}]({link['url']})")

        # Join with single newlines; blank lines are explicit entries above,
        # so list items stay on consecutive lines
        return "\n".join(content)
    except Exception as e:
        return f"Error generating llms.txt: {str(e)}"

def save_llms_txt(content, save_path="llms.txt"):
    """Save the generated content to a file"""
    try:
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(content)
        return f"Successfully saved to {save_path}"
    except Exception as e:
        return f"Error saving file: {str(e)}"

# Create Gradio interface
def process_url(url, save_to_file=False):
    content = generate_llms_txt(url)
    if save_to_file:
        save_message = save_llms_txt(content)
        return content, save_message
    return content, "File not saved (checkbox not selected)"

# Create the Gradio interface
iface = gr.Interface(
    fn=process_url,
    inputs=[
        gr.Textbox(label="Website URL", placeholder="Enter the website URL..."),
        gr.Checkbox(label="Save to file", value=False)
    ],
    outputs=[
        gr.Textbox(label="Generated llms.txt Content", lines=10),
        gr.Textbox(label="Status")
    ],
    title="llms.txt Generator",
    description="Generate an llms.txt file from a website following the specification. The tool extracts relevant information and creates a structured markdown file suitable for LLMs.",
    examples=[
        ["https://example.com", False],
        ["https://docs.python.org", True]
    ],
    theme=gr.themes.Soft()
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
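
# To run locally (assuming this file is saved as app.py):
#   pip install gradio requests beautifulsoup4
#   python app.py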