import requests
from bs4 import BeautifulSoup
import openpyxl
import gradio as gr
from urllib.parse import urlparse, urljoin

visited_urls = set()
unique_urls = set()


def create_sitemap_from_url(home_page_url):
    def crawl_website(url):
        # Skip URLs that have already been visited
        if url in visited_urls:
            return

        # Mark the URL as visited
        visited_urls.add(url)

        # Keep the parsed URL so links can be restricted to the same domain
        parsed_url = urlparse(url)

        # Make a GET request to the URL
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Skip URLs that cannot be fetched
            return

        # Only process successful responses
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Record the URL as part of the sitemap
            unique_urls.add(url)

            # Extract all the links on the page
            links = soup.find_all('a')

            # Visit each link
            for link in links:
                href = link.get('href')
                if href and not href.startswith('#'):
                    # Build the absolute URL from the current page and the (possibly relative) link
                    absolute_url = urljoin(url, href)
                    parsed_absolute_url = urlparse(absolute_url)

                    # Stay on the same domain and skip images and downloadable files
                    if parsed_absolute_url.netloc == parsed_url.netloc and not parsed_absolute_url.path.endswith(
                            ('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')):
                        crawl_website(absolute_url)

    # Reset module-level state so repeated calls (e.g. multiple Gradio submissions) start fresh
    visited_urls.clear()
    unique_urls.clear()

    # Start crawling from the home page
    crawl_website(home_page_url)

    # Drop "http://" URLs whose "https://" counterpart was also crawled
    final_urls = set()
    for url in unique_urls:
        if url.startswith("http://"):
            remaining_url = url[len("http://"):]
            if "https://" + remaining_url in unique_urls:
                continue
        final_urls.add(url)

    return final_urls


def fetch_and_save_to_excel(home_page_url):
    def fetch_page_info(url):
        # Fetch a page and extract its title, meta keywords and meta description
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            return None, None, None
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('title').get_text() if soup.find('title') else 'No title found'
            keywords = soup.find('meta', {'name': 'keywords'})
            keywords = keywords.get('content') if keywords else 'No keywords found'
            description = soup.find('meta', {'name': 'description'})
            description = description.get('content') if description else 'No description found'
            return title, keywords, description
        return None, None, None

    urls = create_sitemap_from_url(home_page_url)
    if urls:
        title_to_urls = {}  # Dictionary grouping URLs by page title

        for url in urls:
            title, _, _ = fetch_page_info(url)  # Fetch only the title for grouping
            if title in title_to_urls:
                title_to_urls[title].append(url)
            else:
                title_to_urls[title] = [url]

        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.append(["URL", "Title", "Keywords", "Description"])

        # Only titles shared by more than one URL are written to the report
        for title, grouped_urls in title_to_urls.items():
            if len(grouped_urls) > 1:
                for url in grouped_urls:
                    fetched_title, keywords, description = fetch_page_info(url)
                    sheet.append([url, fetched_title, keywords, description])

        excel_file = "duplicate_titles.xlsx"
        workbook.save(excel_file)
        return excel_file

    return None
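
# Illustrative sketch (not part of the original script): create_sitemap_from_url can
# also be used on its own to list every crawlable page of a site. The URL below is a
# placeholder, not a real target.
#
#     for page_url in sorted(create_sitemap_from_url("https://example.com/")):
#         print(page_url)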

# Create a Gradio interface
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
    allow_flagging="never",
    examples=[["http://www.embedded-innovations.com/"]],
)

# Launch the Gradio interface
iface.launch()
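
# Illustrative sketch (not part of the original script): the exporter can also be
# called directly, bypassing the Gradio UI. The URL is a placeholder; the returned
# path comes from fetch_and_save_to_excel above.
#
#     report_path = fetch_and_save_to_excel("https://example.com/")
#     if report_path:
#         print("Duplicate-title report written to", report_path)
#     else:
#         print("No pages could be crawled.")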