"""Find duplicate <title> tags across a sitemap's URLs and export them to Excel.

Given a sitemap XML URL, every listed page is fetched once, pages are grouped
by their <title> text, and any title shared by more than one URL is written to
``duplicate_titles.xlsx`` (URL, title, keywords, description). A small Gradio
interface wraps the whole flow.
"""

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin  # noqa: F401 -- kept from original; unused here
import pandas as pd  # noqa: F401 -- kept from original; unused here
from difflib import SequenceMatcher  # noqa: F401 -- kept from original; unused here
from xml.etree import ElementTree as ET
import openpyxl
from openpyxl import Workbook  # noqa: F401 -- kept from original; unused here
from openpyxl.styles import PatternFill  # noqa: F401 -- kept from original; unused here
from openpyxl.utils.dataframe import dataframe_to_rows  # noqa: F401 -- kept from original; unused here
import gradio as gr

# XML namespace used by standard sitemap.org sitemaps.
_SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
# Seconds before a page fetch is abandoned; without this, a dead host
# would hang the UI callback forever.
_REQUEST_TIMEOUT = 10


def fetch_and_save_to_excel(sitemap_url):
    """Fetch every URL listed in *sitemap_url* and export duplicate titles.

    Parameters
    ----------
    sitemap_url : str
        URL of a sitemap XML document (sitemap.org schema).

    Returns
    -------
    str or None
        Path of the generated ``duplicate_titles.xlsx`` file, or ``None``
        when *sitemap_url* is empty or the sitemap cannot be fetched.
        (As in the original, the workbook is written even when no
        duplicates exist; it then contains only the header row.)
    """

    def fetch_page_info(url):
        """Return (title, keywords, description) for *url*.

        Falls back to placeholder strings for missing tags, and to
        (None, None, None) when the page cannot be fetched at all.
        """
        try:
            response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException:
            # Network failure on one page should not abort the whole run.
            return None, None, None
        if response.status_code != 200:
            return None, None, None
        soup = BeautifulSoup(response.text, "html.parser")
        title_tag = soup.find("title")
        title = title_tag.get_text() if title_tag else "No title found"
        keywords = soup.find("meta", {"name": "keywords"})
        keywords = keywords.get("content") if keywords else "No keywords found"
        description = soup.find("meta", {"name": "description"})
        description = description.get("content") if description else "No description found"
        return title, keywords, description

    if not sitemap_url:
        return None

    try:
        response = requests.get(sitemap_url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None

    root = ET.fromstring(response.content)

    # Fetch each page exactly once and cache the full (title, keywords,
    # description) tuple -- the original re-fetched every duplicate URL a
    # second time when writing the workbook.
    page_info = {}
    title_to_urls = {}  # title -> list of URLs sharing that title
    for loc in root.findall(f".//{_SITEMAP_NS}url/{_SITEMAP_NS}loc"):
        url = loc.text
        info = fetch_page_info(url)
        page_info[url] = info
        title_to_urls.setdefault(info[0], []).append(url)

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(["URL", "Title", "Keywords", "Description"])

    for title, urls in title_to_urls.items():
        if len(urls) > 1:  # only titles shared by multiple URLs
            for url in urls:
                fetched_title, keywords, description = page_info[url]
                sheet.append([url, fetched_title, keywords, description])

    excel_file = "duplicate_titles.xlsx"
    workbook.save(excel_file)
    return excel_file


# Gradio UI: sitemap URL in, downloadable Excel file out.
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a sitemap URL to find duplicate titles and export the results to an Excel file.",
    allow_flagging="never",  # Gradio >=3 expects "never", not False
    examples=[["http://www.embedded-innovations.com/sitemap.xml"]],
)

if __name__ == "__main__":
    # Guard the launch so importing this module does not start a server.
    iface.launch()