import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import openpyxl
import gradio as gr

visited_urls = set()
unique_urls = set()
def create_sitemap_from_url(home_page_url):
    def crawl_website(url):
        # Skip URLs that have already been visited
        if url in visited_urls:
            return
        visited_urls.add(url)

        # Parse the current URL so the crawl can stay on the same domain
        parsed_url = urlparse(url)

        # Fetch the page; the timeout stops a dead host from hanging the crawl
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Skip unreachable URLs
            return

        # Only parse successful responses
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            unique_urls.add(url)

            # Visit each link on the page
            for link in soup.find_all('a'):
                href = link.get('href')
                if href and not href.startswith('#'):
                    # Resolve relative links against the current page URL
                    absolute_url = urljoin(url, href)
                    parsed_absolute_url = urlparse(absolute_url)
                    # Follow only same-domain links that are not file downloads
                    if (parsed_absolute_url.netloc == parsed_url.netloc
                            and not parsed_absolute_url.path.endswith(
                                ('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi'))):
                        crawl_website(absolute_url)

    # Start the crawl from the home page
    crawl_website(home_page_url)

    # Drop "http://" URLs whose "https://" counterpart was also crawled
    final_urls = set()
    for url in unique_urls:
        if url.startswith("http://"):
            remaining_url = url[len("http://"):]
            if "https://" + remaining_url in unique_urls:
                continue
        final_urls.add(url)
    return final_urls
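
# Note: crawl_website calls itself once per newly discovered page, so a site
# with more than roughly a thousand reachable pages can exhaust Python's
# default recursion stack. A minimal guard, as a sketch (the 10_000 limit is
# an assumption, not part of the original script):
# import sys
# sys.setrecursionlimit(10_000)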
def fetch_and_save_to_excel(home_page_url):
    def fetch_page_info(url):
        # Fetch a page and extract its title and meta keywords/description
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            return None, None, None
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('title').get_text() if soup.find('title') else 'No title found'
            keywords = soup.find('meta', {'name': 'keywords'})
            keywords = keywords.get('content') if keywords else 'No keywords found'
            description = soup.find('meta', {'name': 'description'})
            description = description.get('content') if description else 'No description found'
            return title, keywords, description
        return None, None, None

    urls = create_sitemap_from_url(home_page_url)
    if urls:
        title_to_urls = {}  # URLs grouped by page title
        for url in urls:
            title, _, _ = fetch_page_info(url)  # Fetch only the title for grouping
            title_to_urls.setdefault(title, []).append(url)

        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.append(["URL", "Title", "Keywords", "Description"])

        # A title shared by more than one URL is a duplicate
        for title, duplicate_urls in title_to_urls.items():
            if len(duplicate_urls) > 1:
                for url in duplicate_urls:
                    fetched_title, keywords, description = fetch_page_info(url)
                    sheet.append([url, fetched_title, keywords, description])

        excel_file = "duplicate_titles.xlsx"
        workbook.save(excel_file)
        return excel_file
    return None
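
# The script imports openpyxl's PatternFill in some versions but never uses
# it; a sketch of how it could shade the data rows before workbook.save()
# runs (the fill color is an assumption):
# from openpyxl.styles import PatternFill
# fill = PatternFill(start_color="FFF2CC", end_color="FFF2CC", fill_type="solid")
# for row in sheet.iter_rows(min_row=2):
#     for cell in row:
#         cell.fill = fill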
# Create the Gradio interface
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
    allow_flagging="never",  # Gradio expects a string here, not a boolean
    examples=[["http://www.embedded-innovations.com/"]],
)

# Launch the Gradio interface
iface.launch()
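
# Usage sketch: to run headless instead of serving the UI, comment out
# iface.launch() above and call the exporter directly:
# excel_path = fetch_and_save_to_excel("http://www.embedded-innovations.com/")
# print(excel_path or "No duplicate titles found")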