import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import openpyxl
import gradio as gr

# Module-level state shared by the crawler
visited_urls = set()
unique_urls = set()


def create_sitemap_from_url(home_page_url):
    """Crawl the site rooted at home_page_url and return the set of unique page URLs."""
    # Reset state so repeated calls (e.g. from the Gradio UI) start from a clean slate
    visited_urls.clear()
    unique_urls.clear()

    def crawl_website(url):
        # Skip URLs that have already been visited
        if url in visited_urls:
            return
        visited_urls.add(url)

        # Parse the URL so the crawl can be restricted to the same domain
        parsed_url = urlparse(url)

        # Make a GET request to the URL
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Skip unreachable URLs
            return

        # Only process successful responses
        if response.status_code != 200:
            return

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Record the URL as a unique page
        unique_urls.add(url)

        # Extract and follow every link on the page
        for link in soup.find_all('a'):
            href = link.get('href')
            if href and not href.startswith('#'):
                # Build the absolute URL from the page URL and the (possibly relative) href
                absolute_url = urljoin(url, href)
                parsed_absolute_url = urlparse(absolute_url)
                # Follow only same-domain links that are not images or downloads
                if (parsed_absolute_url.netloc == parsed_url.netloc
                        and not parsed_absolute_url.path.endswith(
                            ('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi'))):
                    crawl_website(absolute_url)

    # Start the (recursive) crawl at the home page
    crawl_website(home_page_url)

    # Drop "http://" URLs whose "https://" counterpart was also crawled
    final_urls = set()
    for url in unique_urls:
        if url.startswith("http://"):
            remaining_url = url[len("http://"):]
            if "https://" + remaining_url in unique_urls:
                continue
        final_urls.add(url)

    return final_urls


def fetch_and_save_to_excel(home_page_url):
    """Find pages that share a title and export their metadata to an Excel file."""

    def fetch_page_info(url):
        # Fetch a page and return its title, meta keywords, and meta description
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            return None, None, None
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('title').get_text() if soup.find('title') else 'No title found'
            keywords = soup.find('meta', {'name': 'keywords'})
            keywords = keywords.get('content') if keywords else 'No keywords found'
            description = soup.find('meta', {'name': 'description'})
            description = description.get('content') if description else 'No description found'
            return title, keywords, description
        return None, None, None

    urls = create_sitemap_from_url(home_page_url)
    if urls:
        title_to_urls = {}  # URLs grouped by page title
        for url in urls:
            title, _, _ = fetch_page_info(url)  # Fetch only the title for grouping
            title_to_urls.setdefault(title, []).append(url)

        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.append(["URL", "Title", "Keywords", "Description"])

        for title, grouped_urls in title_to_urls.items():
            if len(grouped_urls) > 1:  # Only titles shared by more than one URL
                for url in grouped_urls:
                    fetched_title, keywords, description = fetch_page_info(url)
                    sheet.append([url, fetched_title, keywords, description])

        excel_file = "duplicate_titles.xlsx"
        workbook.save(excel_file)
        return excel_file
    return None


# Create the Gradio interface
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a full homepage URL (including http:// or https://) to find pages with duplicate titles and export the results to an Excel file.",
    allow_flagging="never",
    examples=[["http://www.embedded-innovations.com/"]],
)

# Launch the Gradio interface
iface.launch()
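
# Optional usage sketch (not part of the original app): the exporter can also be called
# directly, without the Gradio UI. The URL below is only a placeholder; any reachable
# homepage URL with an http:// or https:// scheme should work.
#
#     excel_path = fetch_and_save_to_excel("https://example.com/")
#     print(excel_path)  # "duplicate_titles.xlsx", or None if the crawl found no pages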