import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import openpyxl
import gradio as gr
visited_urls = set()
unique_urls = set()


def create_sitemap_from_url(home_page_url):
    """Crawl the site starting at home_page_url and return its set of unique internal page URLs."""

    def crawl_website(url):
        # Check if the URL has already been visited
        if url in visited_urls:
            return

        # Add the URL to the visited set
        visited_urls.add(url)

        # Extract the domain from the given URL
        parsed_url = urlparse(url)
        base_url = parsed_url.scheme + "://" + parsed_url.netloc

        # Make a GET request to the URL
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException:
            # Skip unreachable URLs
            return

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Add the URL to the set of unique URLs
            unique_urls.add(url)

            # Extract all the links on the page
            links = soup.find_all('a')

            # Visit each link
            for link in links:
                href = link.get('href')
                if href and not href.startswith('#'):
                    # Construct the absolute URL by joining the page URL and the (possibly relative) href
                    absolute_url = urljoin(url, href)
                    parsed_absolute_url = urlparse(absolute_url)

                    # Follow only same-domain links that are not images or binary downloads
                    if parsed_absolute_url.netloc == parsed_url.netloc and not parsed_absolute_url.path.endswith(('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')):
                        try:
                            # Visit the absolute URL
                            crawl_website(absolute_url)
                        except requests.exceptions.RequestException:
                            # Skip unreachable URLs
                            continue
        else:
            # Stop on unsuccessful requests
            return

    # Start crawling from the home page
    crawl_website(home_page_url)

    # Drop "http://" URLs whose "https://" counterpart was also crawled
    final_urls = set()
    for url in unique_urls:
        if url.startswith("http://"):
            remaining_url = url[len("http://"):]
            if "https://" + remaining_url in unique_urls:
                continue
        final_urls.add(url)
    return final_urls

def fetch_and_save_to_excel(home_page_url):
    """Crawl the site, group pages by <title>, and export pages that share a title to an Excel file."""

    def fetch_page_info(url):
        # Fetch a page and return its title, meta keywords, and meta description
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('title').get_text() if soup.find('title') else 'No title found'
            keywords = soup.find('meta', {'name': 'keywords'})
            keywords = keywords.get('content') if keywords else 'No keywords found'
            description = soup.find('meta', {'name': 'description'})
            description = description.get('content') if description else 'No description found'
            return title, keywords, description
        return None, None, None

    urls = create_sitemap_from_url(home_page_url)
    if urls:
        title_to_urls = {}  # Dictionary mapping each title to the URLs that use it
        for url in urls:
            title, _, _ = fetch_page_info(url)  # Fetch only the title for comparison
            if title in title_to_urls:
                title_to_urls[title].append(url)
            else:
                title_to_urls[title] = [url]

        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.append(["URL", "Title", "Keywords", "Description"])

        for title, urls in title_to_urls.items():
            if len(urls) > 1:  # Only consider titles shared by more than one URL
                for url in urls:
                    fetched_title, keywords, description = fetch_page_info(url)
                    sheet.append([url, fetched_title, keywords, description])

        excel_file = "duplicate_titles.xlsx"
        workbook.save(excel_file)
        return excel_file
    return None

# Create a Gradio interface
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
    allow_flagging="never",
    examples=[["http://www.embedded-innovations.com/"]],
)

# Launch the Gradio interface
iface.launch()