import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import openpyxl
import gradio as gr

def create_sitemap_from_url(home_page_url):
    # Keep crawl state per call so repeated runs of the Gradio app do not
    # carry over URLs from a previous crawl. Note that the crawl is recursive,
    # so extremely deep link chains could hit Python's recursion limit.
    visited_urls = set()
    unique_urls = set()

    def crawl_website(url):
        # Check if URL has already been visited
        if url in visited_urls:
            return

        # Add URL to visited set
        visited_urls.add(url)

        # Extract domain from the given URL
        parsed_url = urlparse(url)
        base_url = parsed_url.scheme + "://" + parsed_url.netloc

        # Make a GET request to the URL (a timeout keeps one slow page from
        # stalling the whole crawl)
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Skip URLs that cannot be fetched
            return

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Add the URL to the set of unique URLs
            unique_urls.add(url)

            # Extract all the links on the page
            links = soup.find_all('a')

            # Visit each link
            for link in links:
                href = link.get('href')
                if href and not href.startswith('#'):
                    # Construct the absolute URL by joining the base URL and the relative URL
                    absolute_url = urljoin(url, href)
                    parsed_absolute_url = urlparse(absolute_url)

                    # Check if the URL points to a webpage (excluding image URLs)
                    if parsed_absolute_url.netloc == parsed_url.netloc and not parsed_absolute_url.path.endswith(('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')):
                        # Recurse into the absolute URL; crawl_website handles
                        # its own request errors, so no try/except is needed here
                        crawl_website(absolute_url)
        else:
            # Handle unsuccessful requests
            return

    # Call the crawl_website function with the desired URL
    crawl_website(home_page_url)

    # Drop "http://" URLs when the same URL was also collected as "https://"
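    # For example (hypothetical URLs): if both "http://example.com/about" and
    # "https://example.com/about" were collected, only the "https://" variant
    # is kept in the returned set.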
    final_urls = set()
    for url in unique_urls:
        if url.startswith("http://"):
            remaining_url = url[len("http://"):]
            if "https://" + remaining_url in unique_urls:
                continue
        final_urls.add(url)

    return final_urls

def fetch_and_save_to_excel(home_page_url):
    def fetch_page_info(url):
        # Fetch a single page and pull its title, meta keywords, and meta
        # description; returns (None, None, None) when the page cannot be read.
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            return None, None, None
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title_tag = soup.find('title')
            title = title_tag.get_text() if title_tag else 'No title found'
            keywords = soup.find('meta', {'name': 'keywords'})
            keywords = keywords.get('content') if keywords else 'No keywords found'
            description = soup.find('meta', {'name': 'description'})
            description = description.get('content') if description else 'No description found'
            return title, keywords, description
        return None, None, None

    urls = create_sitemap_from_url(home_page_url)
    if urls:
        title_to_urls = {}  # Dictionary to store URLs grouped by title

        for url in urls:
            title, _, _ = fetch_page_info(url)  # Fetch only the title for comparison
            if title is None:
                continue  # Skip pages that could not be fetched

            if title in title_to_urls:
                title_to_urls[title].append(url)
            else:
                title_to_urls[title] = [url]

        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.append(["URL", "Title", "Keywords", "Description"])

        for title, matching_urls in title_to_urls.items():
            if len(matching_urls) > 1:  # Only consider titles shared by multiple URLs
                for url in matching_urls:
                    fetched_title, keywords, description = fetch_page_info(url)
                    sheet.append([url, fetched_title, keywords, description])

        excel_file = "duplicate_titles.xlsx"
        workbook.save(excel_file)
        return excel_file

    return None

# Create a Gradio interface
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
    allow_flagging="never",
    examples=[["http://www.embedded-innovations.com/"]]
)

# Launch the Gradio interface
iface.launch()
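
# A minimal sketch of driving the pipeline without the UI (assumes the target
# site is small enough to crawl page by page; example.com is a placeholder):
#
#   excel_path = fetch_and_save_to_excel("https://example.com/")
#   if excel_path:
#       print(f"Report written to {excel_path}")  # duplicate_titles.xlsx
#   else:
#       print("Crawl failed: no pages were reachable")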