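"""llms.txt Generator

A Gradio app that fetches a webpage, extracts its title, description, and
navigation/footer links, and assembles them into an llms.txt file following
the llms.txt specification (https://llmstxt.org/).
"""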
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def get_website_title(soup):
    """Extract the website title from meta tags, the title tag, or the first h1"""
    # Try the Open Graph title first
    meta_title = soup.find('meta', property='og:title')
    if meta_title and meta_title.get('content'):
        return meta_title['content']
    # Fall back to the regular title tag
    title_tag = soup.find('title')
    if title_tag:
        return title_tag.text.strip()
    # Fall back to the first h1
    h1_tag = soup.find('h1')
    if h1_tag:
        return h1_tag.text.strip()
    return "Website Title"


def get_website_description(soup):
    """Extract the website description from meta tags"""
    # Try the meta description, then the Open Graph description
    meta_desc = soup.find('meta', {'name': 'description'}) or soup.find('meta', property='og:description')
    if meta_desc and meta_desc.get('content'):
        return meta_desc['content']
    # Fall back to the first paragraph
    first_p = soup.find('p')
    if first_p:
        return first_p.text.strip()
    return "Website description"


def get_important_links(soup, base_url):
    """Extract important links from the website's navigation and footer"""
    links = []
    seen_urls = set()
    # Collect navigation links and file them under the Docs section
    nav_elements = soup.find_all(['nav', 'header'])
    for nav in nav_elements:
        for a in nav.find_all('a', href=True):
            url = urljoin(base_url, a['href'])
            if url not in seen_urls and not url.startswith(('javascript:', 'mailto:', 'tel:')):
                text = a.text.strip()
                if text and len(text) > 1:  # Skip empty or single-character links
                    links.append({
                        'title': text,
                        'url': url,
                        'section': 'Docs'
                    })
                    seen_urls.add(url)
    # Collect footer links and file them under the Optional section
    footer = soup.find('footer')
    if footer:
        for a in footer.find_all('a', href=True):
            url = urljoin(base_url, a['href'])
            if url not in seen_urls and not url.startswith(('javascript:', 'mailto:', 'tel:')):
                text = a.text.strip()
                if text and len(text) > 1:
                    links.append({
                        'title': text,
                        'url': url,
                        'section': 'Optional'
                    })
                    seen_urls.add(url)
    return links
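
# A minimal sketch of the returned structure (hypothetical HTML; example.com
# is a placeholder, not output from a real crawl):
#
#   soup = BeautifulSoup('<nav><a href="/guide">Guide</a></nav>', 'html.parser')
#   get_important_links(soup, 'https://example.com')
#   # -> [{'title': 'Guide', 'url': 'https://example.com/guide', 'section': 'Docs'}]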


def generate_llms_txt(url):
    try:
        # Fetch the webpage with a browser-like User-Agent
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        # Get base components
        title = get_website_title(soup)
        description = get_website_description(soup)
        links = get_important_links(soup, url)
        # Generate llms.txt content: H1 title, blockquote summary, H2 sections
        content = [
            f"# {title}",
            "",
            f"> {description}",
            "",
            "## Docs",
            ""
        ]
        # Add documentation links
        doc_links = [link for link in links if link['section'] == 'Docs']
        for link in doc_links:
            content.append(f"- [{link['title']}]({link['url']}): Documentation page")
        # Add optional links if present
        optional_links = [link for link in links if link['section'] == 'Optional']
        if optional_links:
            content.extend(["", "## Optional", ""])
            for link in optional_links:
                content.append(f"- [{link['title']}]({link['url']})")
        # Join all content into the final file body
        return "\n".join(content)
    except Exception as e:
        return f"Error generating llms.txt: {str(e)}"


def save_llms_txt(content, save_path="llms.txt"):
    """Save the generated content to a file"""
    try:
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(content)
        return f"Successfully saved to {save_path}"
    except Exception as e:
        return f"Error saving file: {str(e)}"


def process_url(url, save_to_file=False):
    """Gradio handler: generate the llms.txt content and optionally save it"""
    content = generate_llms_txt(url)
    if save_to_file:
        save_message = save_llms_txt(content)
        return content, save_message
    return content, "File not saved (checkbox not selected)"


# Create the Gradio interface
iface = gr.Interface(
    fn=process_url,
    inputs=[
        gr.Textbox(label="Website URL", placeholder="Enter the website URL..."),
        gr.Checkbox(label="Save to file", value=False)
    ],
    outputs=[
        gr.Textbox(label="Generated llms.txt Content", lines=10),
        gr.Textbox(label="Status")
    ],
    title="llms.txt Generator",
    description="Generate an llms.txt file from a website following the specification. The tool extracts relevant information and creates a structured markdown file suitable for LLMs.",
    examples=[
        ["https://example.com", False],
        ["https://docs.python.org", True]
    ],
    theme=gr.themes.Soft()
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
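
# To run locally: `python app.py`, then open the URL Gradio prints
# (http://127.0.0.1:7860 by default).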