import gradio as gr
import advertools as adv
import pandas as pd
import re
from secrets import token_hex
import logging
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def safe_crawl(url, output_file):
    """Safely perform a web crawl with a timeout."""
    try:
        adv.crawl(
            url,
            output_file,
            follow_links=False,  # Only crawl the main page
            custom_settings={'CLOSESPIDER_TIMEOUT': 30},  # 30 second timeout
        )
        return True
    except Exception as e:
        logger.error(f"Crawl error: {str(e)}")
        return False


def explode_link_df(crawl_df, col_group):
    """Process links from a specific column group in the crawl dataframe."""
    try:
        # advertools joins multiple values in link columns with '@@'
        link = crawl_df[f'{col_group}_links_url'].str.split('@@').explode()
        text = crawl_df[f'{col_group}_links_text'].str.split('@@').explode()
        all_links = []
        for link, text in zip(link.dropna(), text.dropna()):
            if text and text.strip():
                # Collapse newlines and excess whitespace in the link text
                text = re.sub(r"\n+", " ", text.strip(), flags=re.DOTALL)
                text = re.sub(r"\s{3,}", " ", text)
                all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
        return "\n\n".join(all_links)
    except Exception as e:
        logger.error(f"Error processing {col_group} links: {str(e)}")
        return ""


def process_url(url, link_types):
    """Process URL and generate llms.txt content."""
    if not url:
        return "", "Please enter a URL"

    try:
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # Generate unique filename for this crawl
        output_file = token_hex(6)
        jsonl_path = f"{output_file}.jsonl"

        try:
            # Perform the crawl using advertools
            if not safe_crawl(url, jsonl_path):
                return "", "Crawl failed or timed out"

            # Read the crawl results
            crawl_df = pd.read_json(jsonl_path, lines=True)

            # Extract title and meta description
            title = crawl_df['title'].iloc[0] if not pd.isna(crawl_df['title'].iloc[0]) else "Untitled"
            meta_desc = crawl_df['meta_desc'].iloc[0] if not pd.isna(crawl_df['meta_desc'].iloc[0]) else ""

            all_links = []

            # Process links based on selected types
            if link_types and "All links" not in link_types:
                for link_type in link_types:
                    type_match = re.findall(r"header|footer|nav", link_type)
                    if type_match:
                        link_content = explode_link_df(crawl_df, type_match[0])
                        if link_content:
                            all_links.append(link_content)
                            all_links.append('\n\n')
            else:
                # Process all links using advertools
                link_df = adv.crawlytics.links(crawl_df)
                for link, text in link_df[['link', 'text']].values:
                    if text and text.strip():
                        text = re.sub(r"\n+", " ", text.strip(), flags=re.DOTALL)
                        text = re.sub(r"\s{3,}", " ", text)
                        all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))

            # Generate final content
            links_text = "\n\n".join(all_links)
            final_content = f"# {title}\n\n> {meta_desc}\n\n{links_text}"
        finally:
            # Cleanup temporary file
            if os.path.exists(jsonl_path):
                os.remove(jsonl_path)

        return final_content, f"Successfully crawled website. Found {len(all_links)} sections."
    except Exception as e:
        logger.error(f"Error processing URL {url}: {str(e)}")
        return "", f"Error: {str(e)}"


# Custom CSS for Open Sans font and color theme
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');

body {
    font-family: 'Open Sans', sans-serif !important;
}

.primary-btn {
    background-color: #3452db !important;
}

.primary-btn:hover {
    background-color: #2a41af !important;
}
"""

# Create custom theme with specific color
theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.Color(
        name="blue",
        c50="#eef1ff",
        c100="#e0e5ff",
        c200="#c3cbff",
        c300="#a5b2ff",
        c400="#8798ff",
        c500="#6a7eff",
        c600="#3452db",  # Main color
        c700="#2a41af",
        c800="#1f3183",
        c900="#152156",
        c950="#0a102b",
    )
)

with gr.Blocks(theme=theme, css=css) as iface:
    with gr.Row():
        gr.Markdown("# Generate an `llms.txt` file")

    with gr.Row():
        url_input = gr.Textbox(
            label="Enter the home page of a website:",
            placeholder="example: https://example.com",
            lines=1,
        )

    with gr.Row():
        link_types = gr.Dropdown(
            label="Select types of links to extract (leave empty to get all links)",
            choices=["
links", "