update
app.py
CHANGED
@@ -8,7 +8,6 @@ import aiohttp
 from collections import defaultdict
 import unicodedata
 import logging
-import ssl
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -23,220 +22,44 @@ class WebsiteCrawler:
         self.homepage_metadata = None
         self.headers = {
             "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-            "Accept-Language": "en-US,en;q=0.5",
-            "Accept-Encoding": "gzip, deflate, br",
-            "DNT": "1",
-            "Connection": "keep-alive",
-            "Upgrade-Insecure-Requests": "1",
         }
 
     def determine_category_importance(self, url, title, desc):
-        """Improved category detection"""
         url_lower = url.lower()
         path = urlparse(url).path.lower()
 
-        # Homepage
         if path == "/" or path == "":
             return "Main", 10
 
-
-        if any(
-            x in url_lower
-            for x in [
-                "/docs",
-                "/documentation",
-                "/faq",
-                "/help",
-                "frequently-asked-questions",
-            ]
-        ):
+        if any(x in url_lower for x in ["/docs", "/faq", "/help"]):
             return "Documentation", 8
 
-
-        elif any(x in url_lower for x in ["/api", "/developer", "developers"]):
+        elif any(x in url_lower for x in ["/api", "/developer"]):
             return "API", 8
 
-
-        elif any(
-            x in url_lower
-            for x in [
-                "/about",
-                "/company",
-                "/references",
-                "/work-with-us",
-                "careers",
-                "/team",
-                "/contact",
-                "/about-us",
-            ]
-        ):
+        elif any(x in url_lower for x in ["/about", "/company", "/contact"]):
            return "About", 7
 
-
-        elif any(
-            x in url_lower
-            for x in [
-                "/news",
-                "/blog",
-                "/events",
-                "/press",
-                "research",
-                "power-of",
-                "latest",
-            ]
-        ):
+        elif any(x in url_lower for x in ["/news", "/blog", "/events"]):
             return "News", 5
 
-
-        elif any(
-            x in url_lower
-            for x in [
-                "/tools",
-                "/quote",
-                "/pricing",
-                "/services",
-                "/translate",
-                "/order",
-                "/buy",
-            ]
-        ):
+        elif any(x in url_lower for x in ["/tools", "/pricing"]):
             return "Tools", 6
 
-        # Check if URL path contains non-ASCII or percent-encoded characters
-        if bool(re.search(r"[^\x00-\x7F]", path)) or bool(
-            re.search(r"%[0-9A-F]{2}", path)
-        ):
-            return "Optional", 0
-
         return "Optional", 1
 
-    def is_duplicate_content(self, desc, title, url):
-        """Improved duplicate/translation detection"""
-        if not desc or not title:
-            return False
-
-        # Skip non-latin character URLs or URLs with percent-encoded non-ASCII
-        if bool(re.search(r"[^\x00-\x7F]", url)) or bool(
-            re.search(r"%[0-9A-F]{2}", url)
-        ):
-            return True
-
-        # Skip common translation paths
-        translation_indicators = [
-            "/welcome",
-            "/bienvenue",
-            "/willkommen",
-            "/benvenuto",
-            "/tervetuloa",
-            "/bienvenido",
-            "/velkommen",
-            "/welkom",
-            "translate.com/",
-            "/translate/",
-            "/translation/",
-        ]
-        if any(indicator in url.lower() for indicator in translation_indicators):
-            url_path = urlparse(url).path.lower()
-            if url_path != "/":  # Don't skip homepage
-                return True
-
-        # Check for similar content length and patterns
-        for existing_metadata in self.url_metadata.values():
-            existing_desc = existing_metadata.get("description", "")
-            existing_title = existing_metadata.get("title", "")
-            if not existing_desc or not existing_title:
-                continue
-
-            # If descriptions are very similar in length, likely a translation
-            if (
-                abs(len(desc) - len(existing_desc)) < 20
-                and len(desc) > 50
-                and desc != existing_desc
-            ):  # Allow exact duplicates for main page
-                return True
-
-        return False
-
     def clean_text(self, text, is_title=False):
-
-        if not text or len(text.strip()) < 2:
+        if not text:
             return ""
-
-        # Normalize unicode characters
         text = unicodedata.normalize("NFKD", text)
         text = re.sub(r"[^\x00-\x7F]+", "", text)
-
-        # Remove any template variables/placeholders
-        text = re.sub(r"\{\{.*?\}\}", "", text)
-        text = re.sub(r"\{\%.*?\%\}", "", text)
-        text = re.sub(r"\${.*?\}", "", text)
+        text = " ".join(text.split()).strip()
 
         if is_title:
-            # Remove common suffixes and fragments for titles
             text = re.sub(r"^\s*Welcome to\s+", "", text)
-
-            text = re.sub(r"\s+Homepage$", "", text, flags=re.IGNORECASE)
-
-            # Handle overly generic titles
-            if text.lower() in ["features", "home", "homepage", "welcome"]:
-                return ""
-
-        # Only return if we have meaningful text
-        cleaned = " ".join(text.split()).strip()
-        if len(cleaned.split()) < 2 and not is_title:  # Allow single-word titles
-            return ""
-
-        return cleaned
-
-    def clean_description(self, desc):
-        """Clean description text"""
-        if not desc:
-            return ""
-        # Remove leading dashes, hyphens, or colons
-        desc = re.sub(r"^[-:\s]+", "", desc)
-        # Remove any strings that are just "Editors", "APIs", etc.
-        if len(desc.split()) <= 1:
-            return ""
-        return desc.strip()
-
-    def extract_homepage_description(self, soup):
-        """Extract description from homepage with multiple fallbacks"""
-        # Try meta description first
-        meta_desc = soup.find("meta", {"name": "description"})
-        if meta_desc and meta_desc.get("content"):
-            desc = meta_desc["content"]
-            if desc and len(desc.strip()) > 20:
-                return self.clean_text(desc)
-
-        # Try OpenGraph description
-        og_desc = soup.find("meta", property="og:description")
-        if og_desc and og_desc.get("content"):
-            desc = og_desc["content"]
-            if desc and len(desc.strip()) > 20:
-                return self.clean_text(desc)
-
-        # Try first significant paragraph
-        for p in soup.find_all("p"):
-            text = p.get_text().strip()
-            if len(text) > 50 and not any(
-                x in text.lower() for x in ["cookie", "accept", "privacy"]
-            ):
-                return self.clean_text(text)
-
-        # Try main content area if exists
-        main = soup.find("main")
-        if main:
-            first_p = main.find("p")
-            if first_p:
-                text = first_p.get_text().strip()
-                if len(text) > 50:
-                    return self.clean_text(text)
-
-        return None
+        return text
 
     async def crawl_page(self, url, depth, base_domain):
-        """Crawl a single page and extract information"""
         if (
             depth > self.max_depth
             or url in self.visited_urls
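The simplified `determine_category_importance` and `clean_text` in this hunk can be exercised in isolation. Below is a minimal sketch with the new logic lifted out of the class into plain functions; the sample URL and title strings are made up.

```python
# Standalone sketch of the updated helpers; stdlib only.
import re
import unicodedata
from urllib.parse import urlparse


def determine_category_importance(url, title="", desc=""):
    url_lower = url.lower()
    path = urlparse(url).path.lower()

    if path == "/" or path == "":
        return "Main", 10
    if any(x in url_lower for x in ["/docs", "/faq", "/help"]):
        return "Documentation", 8
    elif any(x in url_lower for x in ["/api", "/developer"]):
        return "API", 8
    elif any(x in url_lower for x in ["/about", "/company", "/contact"]):
        return "About", 7
    elif any(x in url_lower for x in ["/news", "/blog", "/events"]):
        return "News", 5
    elif any(x in url_lower for x in ["/tools", "/pricing"]):
        return "Tools", 6
    return "Optional", 1


def clean_text(text, is_title=False):
    if not text:
        return ""
    text = unicodedata.normalize("NFKD", text)    # split accented chars into base + mark
    text = re.sub(r"[^\x00-\x7F]+", "", text)     # drop anything non-ASCII
    text = " ".join(text.split()).strip()         # collapse runs of whitespace
    if is_title:
        text = re.sub(r"^\s*Welcome to\s+", "", text)
    return text


print(determine_category_importance("https://example.com/docs/start"))  # ('Documentation', 8)
print(clean_text("Welcome to Café Docs", is_title=True))                 # 'Cafe Docs'
```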
@@ -245,197 +68,91 @@ class WebsiteCrawler:
             return []
 
         try:
-
-
+            async with aiohttp.ClientSession(
+                timeout=aiohttp.ClientTimeout(total=20)
+            ) as session:
                 async with session.get(
                     url, headers=self.headers, allow_redirects=True
                 ) as response:
-                    if response.status
-                        # Try with alternative headers
-                        alt_headers = {
-                            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
-                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-                        }
-                        async with session.get(
-                            url, headers=alt_headers, allow_redirects=True
-                        ) as retry_response:
-                            if retry_response.status != 200:
-                                return []
-                            text = await retry_response.text()
-                    elif response.status != 200:
+                    if response.status != 200:
                         return []
-
-                    text = await response.text()
-
+                    text = await response.text()
                     self.visited_urls.add(url)
+
                     soup = BeautifulSoup(text, "html.parser")
+                    title_tag = soup.find("title")
+                    title = (
+                        self.clean_text(title_tag.text)
+                        if title_tag
+                        else url.split("/")[-1]
+                    )
 
-
-
-
-
-
-
-                    title_tag = soup.find("title")
-                    if title_tag:
-                        title = title_tag.text
-                    if not title:
-                        h1_tag = soup.find("h1")
-                        if h1_tag:
-                            title = h1_tag.text
-                    if not title:
-                        title = url.split("/")[-1]
-
-                    title = self.clean_text(title, is_title=True)
-
-                    # Extract description with fallbacks
-                    desc = None
-                    meta_desc = soup.find("meta", {"name": "description"})
-                    if meta_desc and meta_desc.get("content"):
-                        desc = meta_desc["content"]
-                    if not desc:
-                        og_desc = soup.find("meta", property="og:description")
-                        if og_desc and og_desc.get("content"):
-                            desc = og_desc["content"]
-                    if not desc:
-                        first_p = soup.find("p")
-                        if first_p:
-                            desc = first_p.text
-
-                    desc = self.clean_text(desc) if desc else ""
-
-                    # Skip if it's duplicate content
-                    if self.is_duplicate_content(desc, title, url):
-                        return []
+                    desc_tag = soup.find("meta", {"name": "description"})
+                    desc = (
+                        self.clean_text(desc_tag["content"])
+                        if desc_tag and desc_tag.get("content")
+                        else ""
+                    )
 
-                    # Determine category and importance
                     category, importance = self.determine_category_importance(
                         url, title, desc
                     )
 
-
-
-
-
-
-
-
-                    )
-                    self.url_metadata[clean_url] = {
-                        "title": title,
-                        "description": desc,
-                        "category": category,
-                        "importance": importance,
-                    }
-
-                    # Find links
+                    self.url_metadata[url] = {
+                        "title": title,
+                        "description": desc,
+                        "category": category,
+                        "importance": importance,
+                    }
+
                     links = []
                     for a in soup.find_all("a", href=True):
-
-                        if
-
-                            for x in [
-                                "javascript:",
-                                "mailto:",
-                                ".pdf",
-                                ".jpg",
-                                ".png",
-                                ".gif",
-                            ]
-                        ):
-                            next_url = urljoin(url, href)
-                            if urlparse(next_url).netloc == base_domain:
-                                links.append(next_url)
-                        return links
+                        next_url = urljoin(url, a["href"])
+                        if urlparse(next_url).netloc == base_domain:
+                            links.append(next_url)
 
+                    return links
         except Exception as e:
             logger.error(f"Error crawling {url}: {str(e)}")
             return []
 
     async def process_homepage(self, url):
-        """Specifically process the homepage to extract key metadata"""
         try:
-            # Configure SSL context
-            ssl_context = ssl.create_default_context()
-            ssl_context.check_hostname = False
-            ssl_context.verify_mode = ssl.CERT_NONE
-
-            connector = aiohttp.TCPConnector(ssl=ssl_context)
-            timeout = aiohttp.ClientTimeout(total=30)
-
             async with aiohttp.ClientSession(
-
+                timeout=aiohttp.ClientTimeout(total=20)
             ) as session:
                 async with session.get(
                     url, headers=self.headers, allow_redirects=True
                 ) as response:
                     if response.status != 200:
-
-
-                        )
-
-                    try:
-                        text = await response.text()
-                    except UnicodeDecodeError:
-                        text = await response.read()
-                        text = text.decode("utf-8", errors="ignore")
-
+                        return
+                    text = await response.text()
                     soup = BeautifulSoup(text, "html.parser")
 
-
-
-
-
-
-
-
-
-
-
-
-                    try:
-                        import json
-
-                        data = json.loads(schema.string)
-                        if isinstance(data, dict):
-                            site_name = data.get("name") or data.get(
-                                "organizationName"
-                            )
-                    except:
-                        pass
-
-                    # Try title tag
-                    if not site_name:
-                        title_tag = soup.find("title")
-                        if title_tag:
-                            site_name = title_tag.text.split("|")[0].strip()
-
-                    # Last resort - use domain name
-                    if not site_name:
-                        site_name = urlparse(url).netloc.split(".")[0].capitalize()
-
-                    # Get homepage description
-                    description = self.extract_homepage_description(soup)
+                    site_name = (
+                        soup.find("title").text.split("|")[0].strip()
+                        if soup.find("title")
+                        else urlparse(url).netloc
+                    )
+                    description = soup.find("meta", {"name": "description"})
+                    description = (
+                        description["content"].strip()
+                        if description and description.get("content")
+                        else None
+                    )
 
                     self.homepage_metadata = {
                         "site_name": self.clean_text(site_name, is_title=True),
-                        "description":
+                        "description": (
+                            self.clean_text(description) if description else None
+                        ),
                     }
-
         except Exception as e:
             logger.error(f"Error processing homepage {url}: {str(e)}")
-            self.homepage_metadata = {
-                "site_name": urlparse(url).netloc.split(".")[0].capitalize(),
-                "description": None,
-            }
 
     async def crawl_website(self, start_url):
-        """Crawl website starting from the given URL"""
        try:
-            # First process the homepage
-            logger.info(f"Processing homepage: {start_url}")
             await self.process_homepage(start_url)
-
             base_domain = urlparse(start_url).netloc
             queue = [(start_url, 0)]
             seen = {start_url}
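Both `crawl_page` and `process_homepage` now open a short-lived `aiohttp.ClientSession` with a 20-second `ClientTimeout` and read the title and meta description directly. A self-contained sketch of that fetch-and-extract flow follows; the URL is a placeholder and the crawler's custom headers are omitted.

```python
# Minimal sketch of the new fetch-and-extract flow used by crawl_page/process_homepage.
import asyncio

import aiohttp
from bs4 import BeautifulSoup


async def fetch_metadata(url):
    async with aiohttp.ClientSession(
        timeout=aiohttp.ClientTimeout(total=20)  # same 20 s total budget as the crawler
    ) as session:
        async with session.get(url, allow_redirects=True) as response:
            if response.status != 200:
                return None
            text = await response.text()

    soup = BeautifulSoup(text, "html.parser")
    title_tag = soup.find("title")
    desc_tag = soup.find("meta", {"name": "description"})
    return {
        "title": title_tag.text.strip() if title_tag else url.split("/")[-1],
        "description": (
            desc_tag["content"].strip() if desc_tag and desc_tag.get("content") else ""
        ),
    }


if __name__ == "__main__":
    print(asyncio.run(fetch_metadata("https://example.com")))
```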
@@ -444,240 +161,95 @@ class WebsiteCrawler:
                 current_url, depth = queue.pop(0)
                 if depth > self.max_depth:
                     continue
-
-                logger.info(f"Crawling page: {current_url} (depth: {depth})")
                 links = await self.crawl_page(current_url, depth, base_domain)
-                logger.info(f"Found {len(links)} links on {current_url}")
-
                 for link in links:
-                    if link not in seen
+                    if link not in seen:
                         seen.add(link)
                         queue.append((link, depth + 1))
 
-            logger.info(f"Crawl completed. Visited {len(self.visited_urls)} pages")
-
         except Exception as e:
             logger.error(f"Error during crawl: {str(e)}")
             raise
 
     def generate_llms_txt(self):
-        """Generate llms.txt content"""
-        logger.info(f"Starting generate_llms_txt with {len(self.url_metadata)} URLs")
-
         if not self.url_metadata:
-
-            return "No content was found to generate llms.txt"
+            return "No content available."
-
-        # Sort URLs by importance and remove duplicates
-        sorted_urls = []
-        seen_titles = set()
-
-        for url, metadata in sorted(
-            self.url_metadata.items(),
-            key=lambda x: (x[1]["importance"], x[0]),
-            reverse=True,
-        ):
-            if metadata["title"] not in seen_titles:
-                sorted_urls.append((url, metadata))
-                seen_titles.add(metadata["title"])
 
-        logger.info(f"Found {len(sorted_urls)} unique URLs after deduplication")
-
-        if not sorted_urls:
-            logger.error("No valid URLs found after sorting")
-            return "No valid content was found"
-
-        # Generate content
         content = []
+        homepage_title = self.homepage_metadata.get("site_name", "Website")
+        homepage_description = self.homepage_metadata.get(
+            "description", "No description available."
+        )
+        content.append(f"# {homepage_title}\n\n> {homepage_description}\n")
 
-        # Use homepage metadata for main title and description
-        main_title = self.homepage_metadata.get("site_name", "Welcome")
-        homepage_description = self.homepage_metadata.get("description")
-
-        logger.info(f"Homepage title: {main_title}")
-        logger.info(f"Homepage description: {homepage_description}")
-
-        content.append(f"# {main_title}")
-        if homepage_description:
-            content.append(f"\n> {homepage_description}")
-        elif len(sorted_urls) > 0:
-            # Fallback to first good description from content if no homepage description
-            for _, metadata in sorted_urls:
-                desc = self.clean_description(metadata["description"])
-                if desc and len(desc) > 20 and "null" not in desc.lower():
-                    content.append(f"\n> {desc}")
-                    break
-
-        # Group by category
         categories = defaultdict(list)
-        for url, metadata in
-
-            categories[metadata["category"]].append((url, metadata))
+        for url, metadata in self.url_metadata.items():
+            categories[metadata["category"]].append((url, metadata))
 
-        logger.info(f"Categories found: {list(categories.keys())}")
-
-        # Add sections in a logical order
         category_order = [
             "Main",
             "Documentation",
             "API",
-            "Tools",
             "About",
             "News",
+            "Tools",
             "Optional",
         ]
-
-        # Only show Main section if it has content different from the homepage description
-        if "Main" in categories:
-            main_content = categories["Main"]
-            if (
-                len(main_content) == 1
-                and main_content[0][1]["description"] == homepage_description
-            ):
-                logger.info("Removing duplicate Main content")
-                del categories["Main"]
-
         for category in category_order:
-            if category in categories
-
-
-
-
-
-                # Sort links within category by importance and description length
-                category_links = sorted(
-                    categories[category],
-                    key=lambda x: (-len(x[1]["description"] or ""), x[1]["title"]),
-                )
-
-                links = []
-                seen_desc = set()  # Avoid duplicate descriptions within category
-                for url, metadata in category_links:
-                    title = metadata["title"].strip()
-                    desc = self.clean_description(metadata["description"])
-
-                    # Skip if description is duplicate within category
-                    if desc in seen_desc:
-                        continue
-                    seen_desc.add(desc)
-
-                    if desc:
-                        links.append(f"- [{title}]({url}): {desc}")
-                    else:
-                        links.append(f"- [{title}]({url})")
-
-                content.append("\n".join(links))
+            if category in categories:
+                content.append(f"## {category}")
+                for url, metadata in categories[category]:
+                    content.append(
+                        f"- [{metadata['title']}]({url}): {metadata['description']}"
+                    )
 
-
-        logger.info(f"Generated content length: {len(final_content)}")
-        return final_content
+        return "\n".join(content)
 
 
 async def process_url(url, max_depth, max_pages):
-    """Process URL and generate llms.txt"""
     try:
-        # Add https:// if not present
         if not url.startswith(("http://", "https://")):
             url = "https://" + url
-
-        # Validate URL
         result = urlparse(url)
-        if not
+        if not result.scheme or not result.netloc:
             return "", "Invalid URL format. Please enter a valid URL."
 
-        logger.info(f"Starting crawl of {url}")
-
-        # Process website
         crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
         await crawler.crawl_website(url)
-
-        logger.info("Generating llms.txt content")
         content = crawler.generate_llms_txt()
 
-        if not content or content.strip() == "":
-            return "", "No content was generated. Check the logs for details."
-
         return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
-
     except Exception as e:
         logger.error(f"Error processing URL {url}: {str(e)}")
         return "", f"Error: {str(e)}"
 
 
-#
+# Gradio interface
 theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")
 
-with gr.Blocks(
-    theme=theme,
-    css="""
-    @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
-
-    .gradio-container {
-        font-family: 'Open Sans', sans-serif !important;
-    }
-
-    .gr-button {
-        font-family: 'Open Sans', sans-serif !important;
-        font-weight: 600 !important;
-    }
-
-    .primary-btn {
-        background-color: #2436d4 !important;
-        color: white !important;
-    }
-
-    .primary-btn:hover {
-        background-color: #1c2aa8 !important;
-    }
-
-    [data-testid="textbox"] {
-        font-family: 'Open Sans', sans-serif !important;
-    }
-
-    .gr-padded {
-        font-family: 'Open Sans', sans-serif !important;
-    }
-
-    .gr-input {
-        font-family: 'Open Sans', sans-serif !important;
-    }
-
-    .gr-label {
-        font-family: 'Open Sans', sans-serif !important;
-    }
-    """,
-) as iface:
-
+with gr.Blocks(theme=theme) as iface:
     with gr.Row():
         url_input = gr.Textbox(
-            label="Website URL",
-            placeholder="Enter the website URL (e.g., example.com)",
-            info="The URL will be automatically prefixed with https:// if not provided",
+            label="Website URL", placeholder="Enter the website URL (e.g., example.com)"
         )
-
     with gr.Row():
-
-
-
-
-
-
-
-        )
-
-        generate_btn = gr.Button("Generate llms.txt", variant="primary")
-
+        depth_input = gr.Slider(
+            minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"
+        )
+        pages_input = gr.Slider(
+            minimum=10, maximum=100, value=50, step=10, label="Maximum Pages"
+        )
+    generate_btn = gr.Button("Generate llms.txt")
     output = gr.Textbox(
-        label="Generated llms.txt Content",
-        lines=20,
-        show_copy_button=True,
-        container=True,
+        label="Generated llms.txt Content", lines=20, show_copy_button=True
     )
-
     status = gr.Textbox(label="Status")
 
+    async def process_url_async_wrapper(url, depth, pages):
+        return await process_url(url, depth, pages)
+
     generate_btn.click(
-        fn=
+        fn=process_url_async_wrapper,
         inputs=[url_input, depth_input, pages_input],
         outputs=[output, status],
     )
|