import requests
from bs4 import BeautifulSoup
import openpyxl
import gradio as gr
from urllib.parse import urlparse, urljoin

visited_urls = set()
unique_urls = set()


def create_sitemap_from_url(home_page_url):
    def crawl_website(url):
        # Skip URLs that have already been visited
        if url in visited_urls:
            return

        # Mark the URL as visited
        visited_urls.add(url)

        # Keep the parsed URL so links can be restricted to the same domain
        parsed_url = urlparse(url)

        # Make a GET request to the URL
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Skip URLs that cannot be fetched
            return

        # Only process successful responses
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Record the URL as part of the sitemap
            unique_urls.add(url)

            # Extract all the links on the page
            links = soup.find_all('a')

            # Visit each link
            for link in links:
                href = link.get('href')
                if href and not href.startswith('#'):
                    # Build the absolute URL from the current page and the (possibly relative) link
                    absolute_url = urljoin(url, href)
                    parsed_absolute_url = urlparse(absolute_url)

                    # Stay on the same domain and skip images and downloadable files
                    if parsed_absolute_url.netloc == parsed_url.netloc and not parsed_absolute_url.path.endswith(
                            ('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')):
                        crawl_website(absolute_url)

    # Reset module-level state so repeated calls (e.g. multiple Gradio submissions) start fresh
    visited_urls.clear()
    unique_urls.clear()

    # Start crawling from the home page
    crawl_website(home_page_url)

    # Drop "http://" URLs whose "https://" counterpart was also crawled
    final_urls = set()
    for url in unique_urls:
        if url.startswith("http://"):
            remaining_url = url[len("http://"):]
            if "https://" + remaining_url in unique_urls:
                continue
        final_urls.add(url)

    return final_urls


def fetch_and_save_to_excel(home_page_url):
    def fetch_page_info(url):
        # Fetch a page and extract its title, meta keywords and meta description
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            return None, None, None
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('title').get_text() if soup.find('title') else 'No title found'
            keywords = soup.find('meta', {'name': 'keywords'})
            keywords = keywords.get('content') if keywords else 'No keywords found'
            description = soup.find('meta', {'name': 'description'})
            description = description.get('content') if description else 'No description found'
            return title, keywords, description
        return None, None, None

    urls = create_sitemap_from_url(home_page_url)
    if urls:
        title_to_urls = {}  # Dictionary grouping URLs by page title

        for url in urls:
            title, _, _ = fetch_page_info(url)  # Fetch only the title for grouping
            if title in title_to_urls:
                title_to_urls[title].append(url)
            else:
                title_to_urls[title] = [url]

        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.append(["URL", "Title", "Keywords", "Description"])

        # Only titles shared by more than one URL are written to the report
        for title, grouped_urls in title_to_urls.items():
            if len(grouped_urls) > 1:
                for url in grouped_urls:
                    fetched_title, keywords, description = fetch_page_info(url)
                    sheet.append([url, fetched_title, keywords, description])

        excel_file = "duplicate_titles.xlsx"
        workbook.save(excel_file)
        return excel_file

    return None
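
# Illustrative sketch (not part of the original script): create_sitemap_from_url can
# also be used on its own to list every crawlable page of a site. The URL below is a
# placeholder, not a real target.
#
#     for page_url in sorted(create_sitemap_from_url("https://example.com/")):
#         print(page_url)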

# Create a Gradio interface
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
    allow_flagging="never",
    examples=[["http://www.embedded-innovations.com/"]],
)

# Launch the Gradio interface
iface.launch()
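
# Illustrative sketch (not part of the original script): the exporter can also be
# called directly, bypassing the Gradio UI. The URL is a placeholder; the returned
# path comes from fetch_and_save_to_excel above.
#
#     report_path = fetch_and_save_to_excel("https://example.com/")
#     if report_path:
#         print("Duplicate-title report written to", report_path)
#     else:
#         print("No pages could be crawled.")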