import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import asyncio
from collections import defaultdict
import unicodedata
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class WebsiteCrawler:
    def __init__(self, max_depth=3, max_pages=50):
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.visited_urls = set()
        self.url_metadata = defaultdict(dict)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def clean_text(self, text, is_title=False):
        """Clean and normalize text"""
        if not text:
            return ""

        # Normalize unicode characters and strip non-ASCII
        text = unicodedata.normalize('NFKD', text)
        text = re.sub(r'[^\x00-\x7F]+', '', text)

        if is_title:
            # Remove common suffixes and fragments for titles
            text = re.sub(r'\s*[\|\-#:•].*', '', text)
            text = re.sub(r'^\s*Welcome to\s+', '', text)
            text = text.replace('docusaurus_skipToContent_fallback', '')

        return ' '.join(text.split()).strip()

    async def crawl_page(self, url, depth, base_domain):
        """Crawl a single page and extract information"""
        if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
            return []

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.encoding = 'utf-8'
            self.visited_urls.add(url)

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract title metadata (og:title is a meta tag, so read its content attribute)
            title_tag = (
                soup.find('meta', property='og:title')
                or soup.find('title')
                or soup.find('h1')
            )
            if title_tag is None:
                title = url.split('/')[-1]
            elif title_tag.name == 'meta':
                title = title_tag.get('content', '')
            else:
                title = title_tag.text
            title = self.clean_text(title, is_title=True)

            # Extract description metadata
            desc_tag = (
                soup.find('meta', {'name': 'description'})
                or soup.find('meta', property='og:description')
            )
            desc = self.clean_text(desc_tag.get('content', '') if desc_tag else '')

            # Determine category and importance from the URL path
            url_lower = url.lower()
            category = 'Optional'
            importance = 0
            if 'docs' in url_lower or 'documentation' in url_lower:
                category = 'Docs'
                importance = 5
            elif 'api' in url_lower:
                category = 'API'
                importance = 4

            # Store metadata keyed by the URL without fragment or trailing slash
            clean_url = re.sub(r'#.*', '', url).rstrip('/')
            self.url_metadata[clean_url] = {
                'title': title,
                'description': desc,
                'category': category,
                'importance': importance,
            }

            # Find outgoing links, skipping non-HTML and non-navigable targets
            return [
                urljoin(url, a['href'])
                for a in soup.find_all('a', href=True)
                if not any(
                    x in a['href'].lower()
                    for x in ['javascript:', 'mailto:', '.pdf', '.jpg', '.png', '.gif']
                )
            ]
        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return []

    async def crawl_website(self, start_url):
        """Crawl website starting from the given URL"""
        base_domain = urlparse(start_url).netloc
        queue = [(start_url, 0)]
        seen = {start_url}

        # Breadth-first crawl, restricted to the start URL's domain
        while queue and len(self.visited_urls) < self.max_pages:
            current_url, depth = queue.pop(0)
            if depth > self.max_depth:
                continue

            links = await self.crawl_page(current_url, depth, base_domain)
            for link in links:
                if link not in seen and urlparse(link).netloc == base_domain:
                    seen.add(link)
                    queue.append((link, depth + 1))

    def generate_llms_txt(self):
        """Generate llms.txt content"""
        if not self.url_metadata:
            return "No content was found to generate llms.txt"

        # Sort URLs, highest importance first
        sorted_urls = sorted(
            self.url_metadata.items(),
            key=lambda x: (x[1]['importance'], x[0]),
            reverse=True,
        )

        # Generate header from the most important page
        content = []
        main_metadata = sorted_urls[0][1]
        content.append(f"# {main_metadata['title']}")
        if main_metadata['description']:
            content.append(f"\n> {main_metadata['description']}")

        # Group by category, skipping duplicate titles
        categories = defaultdict(list)
        seen_titles = set()
        for url, metadata in sorted_urls:
            title = metadata['title']
            if title not in seen_titles:
                categories[metadata['category']].append((url, metadata))
                seen_titles.add(title)

        # Add sections in a fixed order
        for category in ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']:
            if category in categories:
                content.append(f"\n## {category}")
                for url, metadata in categories[category]:
                    if metadata['description']:
                        content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
                    else:
                        content.append(f"\n- [{metadata['title']}]({url})")

        return "\n".join(content)


async def process_url(url, max_depth, max_pages):
    """Process URL and generate llms.txt"""
    try:
        # Add https:// if not present
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        # Validate URL
        result = urlparse(url)
        if not all([result.scheme, result.netloc]):
            return "", "Invalid URL format. Please enter a valid URL."

        # Crawl the website and build the llms.txt content
        crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
        await crawler.crawl_website(url)
        content = crawler.generate_llms_txt()

        return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
    except Exception as e:
        return "", f"Error: {str(e)}"


# Create Gradio interface
theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")

with gr.Blocks(theme=theme, css="""
    .primary-btn {background-color: #2436d4 !important;}
    .primary-btn:hover {background-color: #1c2aa8 !important;}
""") as iface:
    gr.Markdown("# llms.txt Generator")
    gr.Markdown("Generate an llms.txt file from a website following the specification.")

    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="Enter the website URL (e.g., example.com)",
            info="The URL will be automatically prefixed with https:// if not provided",
        )

    with gr.Row():
        with gr.Column():
            depth_input = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth")
        with gr.Column():
            pages_input = gr.Slider(minimum=10, maximum=100, value=50, step=10, label="Maximum Pages")

    generate_btn = gr.Button("Generate llms.txt", variant="primary")

    output = gr.Textbox(
        label="Generated llms.txt Content",
        lines=20,
        show_copy_button=True,
        container=True,
    )
    status = gr.Textbox(label="Status")

    generate_btn.click(
        fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
        inputs=[url_input, depth_input, pages_input],
        outputs=[output, status],
    )

if __name__ == "__main__":
    iface.launch()