"""Find duplicate <title> tags across a sitemap's URLs and export them to Excel.

Given a sitemap XML URL, every listed page is fetched once, pages are grouped
by their <title> text, and any title shared by more than one URL is written to
``duplicate_titles.xlsx`` (URL, title, keywords, description). A small Gradio
interface wraps the whole flow.
"""

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin  # noqa: F401 -- kept from original; unused here
import pandas as pd  # noqa: F401 -- kept from original; unused here
from difflib import SequenceMatcher  # noqa: F401 -- kept from original; unused here
from xml.etree import ElementTree as ET
import openpyxl
from openpyxl import Workbook  # noqa: F401 -- kept from original; unused here
from openpyxl.styles import PatternFill  # noqa: F401 -- kept from original; unused here
from openpyxl.utils.dataframe import dataframe_to_rows  # noqa: F401 -- kept from original; unused here
import gradio as gr

# XML namespace used by standard sitemap.org sitemaps.
_SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
# Seconds before a page fetch is abandoned; without this, a dead host
# would hang the UI callback forever.
_REQUEST_TIMEOUT = 10


def fetch_and_save_to_excel(sitemap_url):
    """Fetch every URL listed in *sitemap_url* and export duplicate titles.

    Parameters
    ----------
    sitemap_url : str
        URL of a sitemap XML document (sitemap.org schema).

    Returns
    -------
    str or None
        Path of the generated ``duplicate_titles.xlsx`` file, or ``None``
        when *sitemap_url* is empty or the sitemap cannot be fetched.
        (As in the original, the workbook is written even when no
        duplicates exist; it then contains only the header row.)
    """

    def fetch_page_info(url):
        """Return (title, keywords, description) for *url*.

        Falls back to placeholder strings for missing tags, and to
        (None, None, None) when the page cannot be fetched at all.
        """
        try:
            response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException:
            # Network failure on one page should not abort the whole run.
            return None, None, None
        if response.status_code != 200:
            return None, None, None
        soup = BeautifulSoup(response.text, "html.parser")
        title_tag = soup.find("title")
        title = title_tag.get_text() if title_tag else "No title found"
        keywords = soup.find("meta", {"name": "keywords"})
        keywords = keywords.get("content") if keywords else "No keywords found"
        description = soup.find("meta", {"name": "description"})
        description = description.get("content") if description else "No description found"
        return title, keywords, description

    if not sitemap_url:
        return None

    try:
        response = requests.get(sitemap_url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None

    root = ET.fromstring(response.content)

    # Fetch each page exactly once and cache the full (title, keywords,
    # description) tuple -- the original re-fetched every duplicate URL a
    # second time when writing the workbook.
    page_info = {}
    title_to_urls = {}  # title -> list of URLs sharing that title
    for loc in root.findall(f".//{_SITEMAP_NS}url/{_SITEMAP_NS}loc"):
        url = loc.text
        info = fetch_page_info(url)
        page_info[url] = info
        title_to_urls.setdefault(info[0], []).append(url)

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(["URL", "Title", "Keywords", "Description"])

    for title, urls in title_to_urls.items():
        if len(urls) > 1:  # only titles shared by multiple URLs
            for url in urls:
                fetched_title, keywords, description = page_info[url]
                sheet.append([url, fetched_title, keywords, description])

    excel_file = "duplicate_titles.xlsx"
    workbook.save(excel_file)
    return excel_file


# Gradio UI: sitemap URL in, downloadable Excel file out.
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a sitemap URL to find duplicate titles and export the results to an Excel file.",
    allow_flagging="never",  # Gradio >=3 expects "never", not False
    examples=[["http://www.embedded-innovations.com/sitemap.xml"]],
)

if __name__ == "__main__":
    # Guard the launch so importing this module does not start a server.
    iface.launch()