# Spaces: Runtime error (hosting-page status captured together with the source; not part of the program)
import xml.etree.ElementTree as ET
from collections import defaultdict

import gradio as gr
import openpyxl
import requests
from bs4 import BeautifulSoup
def fetch_page_info(url, timeout=10):
    """Download a page and extract its title, meta keywords and meta description.

    Args:
        url: Address of the HTML page to download.
        timeout: Seconds to wait for the server before giving up, so that a
            single unresponsive host cannot stall the caller forever.

    Returns:
        A ``(title, keywords, description)`` tuple. A missing ``<title>`` or
        ``<meta>`` tag is reported with a "No ... found" placeholder string.
        ``(None, None, None)`` is returned when the request fails or the
        server does not answer with HTTP 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection errors, timeouts and malformed URLs are reported the
        # same way as a non-200 answer.
        return None, None, None
    if response.status_code != 200:
        return None, None, None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Look each tag up once and reuse the result.
    title_tag = soup.find('title')
    title = title_tag.get_text() if title_tag is not None else 'No title found'

    keywords_tag = soup.find('meta', {'name': 'keywords'})
    keywords = (
        keywords_tag.get('content')
        if keywords_tag is not None
        else 'No keywords found'
    )

    description_tag = soup.find('meta', {'name': 'description'})
    description = (
        description_tag.get('content')
        if description_tag is not None
        else 'No description found'
    )

    return title, keywords, description
def main_page(sitemap_url):
    """Export the pages of a sitemap that share a title to an Excel file.

    Every ``<url>/<loc>`` entry of the sitemap is fetched once with
    ``fetch_page_info``; pages are grouped by title and every group with more
    than one page is written to ``duplicate_titles.xlsx``.

    Args:
        sitemap_url: Address of the XML sitemap. Empty or ``None`` means
            there is nothing to do.

    Returns:
        The path of the written workbook, or ``None`` when no sitemap URL was
        given or the sitemap could not be downloaded or parsed.
    """
    if not sitemap_url:
        return None

    try:
        response = requests.get(sitemap_url, timeout=10)
    except requests.RequestException:
        # Covers unreachable hosts, timeouts and incomplete/invalid URLs.
        return None
    if response.status_code != 200:
        return None

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError:
        # The server answered, but not with well-formed XML.
        return None

    ns = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
    loc_path = f".//{ns}url/{ns}loc"

    # title -> list of ready-to-write rows. Keeping the full row here means
    # each page is downloaded only once instead of once for grouping and a
    # second time for the export.
    rows_by_title = defaultdict(list)
    for loc in root.iterfind(loc_path):
        url = (loc.text or "").strip()  # <loc> may be empty or padded
        if not url:
            continue
        try:
            title, keywords, description = fetch_page_info(url)
        except requests.RequestException:
            # One unreachable page must not abort the whole export.
            continue
        if title is None:
            # Failed fetches all report a None title; grouping them would
            # list unrelated broken pages as "duplicates".
            continue
        rows_by_title[title].append([url, title, keywords, description])

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(["URL", "Title", "Keywords", "Description"])
    for rows in rows_by_title.values():
        if len(rows) > 1:  # only titles used by more than one URL
            for row in rows:
                sheet.append(row)

    excel_file = "duplicate_titles.xlsx"
    workbook.save(excel_file)
    return excel_file
# Gradio front end: one text box for the sitemap URL, one downloadable file
# as the result.
iface = gr.Interface(
    fn=main_page,
    # Use the top-level component class; the legacy `gr.inputs` namespace is
    # not available in current Gradio releases and fails at import time.
    inputs=[gr.Textbox(placeholder="Enter sitemap URL here")],
    outputs="file",
    # Crawling a whole sitemap is expensive, so run it only on an explicit
    # submit rather than on every change of the text box.
    live=False,
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a sitemap URL to find duplicate titles and export the results to an Excel file.",
    examples=[["http://www.embedded-innovations.com/sitemap.xml"]],
)


if __name__ == "__main__":
    iface.launch()